In [139]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint as sp_randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import itertools
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import f1_score, recall_score
from keras.callbacks import EarlyStopping
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
In [140]:
# mount your Google Drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [141]:
# Load the CSV file into a Pandas DataFrame
file_path = "/content/drive/MyDrive/Data_Mining_Coursework/Employee_Attrition.csv"
df = pd.read_csv(file_path, encoding='utf-8')
In [142]:
# Making a copy of df for later use
new_df = df.copy()
In [143]:
# Data quality check / looking for NaN
display(df.isnull().any())
Age                         False
Attrition                   False
BusinessTravel              False
DailyRate                   False
Department                  False
DistanceFromHome            False
Education                   False
EducationField              False
EmployeeCount               False
EmployeeNumber              False
EnvironmentSatisfaction     False
Gender                      False
HourlyRate                  False
JobInvolvement              False
JobLevel                    False
JobRole                     False
JobSatisfaction             False
MaritalStatus               False
MonthlyIncome               False
MonthlyRate                 False
NumCompaniesWorked          False
Over18                      False
OverTime                    False
PercentSalaryHike           False
PerformanceRating           False
RelationshipSatisfaction    False
StandardHours               False
StockOptionLevel            False
TotalWorkingYears           False
TrainingTimesLastYear       False
WorkLifeBalance             False
YearsAtCompany              False
YearsInCurrentRole          False
YearsSinceLastPromotion     False
YearsWithCurrManager        False
dtype: bool
In [144]:
# check for missing values
if df.isnull().values.any():
    print("DataFrame contains missing values")
else:
    print("DataFrame does not contain missing values")
DataFrame does not contain missing values

*Performing EDA*



In [145]:
# Check the shape of the dataset
df.shape
Out[145]:
(1470, 35)
In [146]:
# Check the data types of each column using the
df.dtypes
Out[146]:
Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object
In [147]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(9)
memory usage: 402.1+ KB
In [148]:
df.describe()
Out[148]:
Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
count 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 ... 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000
mean 36.923810 802.485714 9.192517 2.912925 1.0 1024.865306 2.721769 65.891156 2.729932 2.063946 ... 2.712245 80.0 0.793878 11.279592 2.799320 2.761224 7.008163 4.229252 2.187755 4.123129
std 9.135373 403.509100 8.106864 1.024165 0.0 602.024335 1.093082 20.329428 0.711561 1.106940 ... 1.081209 0.0 0.852077 7.780782 1.289271 0.706476 6.126525 3.623137 3.222430 3.568136
min 18.000000 102.000000 1.000000 1.000000 1.0 1.000000 1.000000 30.000000 1.000000 1.000000 ... 1.000000 80.0 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
25% 30.000000 465.000000 2.000000 2.000000 1.0 491.250000 2.000000 48.000000 2.000000 1.000000 ... 2.000000 80.0 0.000000 6.000000 2.000000 2.000000 3.000000 2.000000 0.000000 2.000000
50% 36.000000 802.000000 7.000000 3.000000 1.0 1020.500000 3.000000 66.000000 3.000000 2.000000 ... 3.000000 80.0 1.000000 10.000000 3.000000 3.000000 5.000000 3.000000 1.000000 3.000000
75% 43.000000 1157.000000 14.000000 4.000000 1.0 1555.750000 4.000000 83.750000 3.000000 3.000000 ... 4.000000 80.0 1.000000 15.000000 3.000000 3.000000 9.000000 7.000000 3.000000 7.000000
max 60.000000 1499.000000 29.000000 5.000000 1.0 2068.000000 4.000000 100.000000 4.000000 5.000000 ... 4.000000 80.0 3.000000 40.000000 6.000000 4.000000 40.000000 18.000000 15.000000 17.000000

8 rows × 26 columns

In [149]:
df['TotalWorkingYears'].describe()
Out[149]:
count    1470.000000
mean       11.279592
std         7.780782
min         0.000000
25%         6.000000
50%        10.000000
75%        15.000000
max        40.000000
Name: TotalWorkingYears, dtype: float64
In [150]:
# plot histograms of numerical features
num_cols = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition']

fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(20, 25))
axes = axes.flatten()

for i, col in enumerate(num_cols):
    sns.histplot(df[col], ax=axes[i])
    axes[i].set_title(col)

plt.tight_layout()
plt.show()
In [151]:
# separate records with 'Attrition' = Yes and 'Attrition' = No
attr_yes = df[df['Attrition'] == 'Yes']
attr_no = df[df['Attrition'] == 'No']

# plot histograms of numerical features for 'Attrition' = Yes in red
# and 'Attrition' = No in blue
num_cols = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(20, 25))
axes = axes.flatten()

for i, col in enumerate(num_cols):
    sns.histplot(attr_yes[col], ax=axes[i], color='red', alpha=0.5, label='Attrition Yes')
    sns.histplot(attr_no[col], ax=axes[i], color='blue', alpha=0.5, label='Attrition No')
    axes[i].set_title(col)
    axes[i].legend()

plt.tight_layout()
plt.show()
In [152]:
# Get the number of unique values for each column
unique_counts = df.nunique()

# Identify columns with only one unique value
to_drop = [col for col in unique_counts.index if unique_counts[col] == 1]

# Add the EmployeeNumber column to the list of columns to drop
to_drop.append('EmployeeNumber')

# Drop the columns
df.drop(to_drop, axis=1, inplace=True)
In [153]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EnvironmentSatisfaction   1470 non-null   int64 
 9   Gender                    1470 non-null   object
 10  HourlyRate                1470 non-null   int64 
 11  JobInvolvement            1470 non-null   int64 
 12  JobLevel                  1470 non-null   int64 
 13  JobRole                   1470 non-null   object
 14  JobSatisfaction           1470 non-null   int64 
 15  MaritalStatus             1470 non-null   object
 16  MonthlyIncome             1470 non-null   int64 
 17  MonthlyRate               1470 non-null   int64 
 18  NumCompaniesWorked        1470 non-null   int64 
 19  OverTime                  1470 non-null   object
 20  PercentSalaryHike         1470 non-null   int64 
 21  PerformanceRating         1470 non-null   int64 
 22  RelationshipSatisfaction  1470 non-null   int64 
 23  StockOptionLevel          1470 non-null   int64 
 24  TotalWorkingYears         1470 non-null   int64 
 25  TrainingTimesLastYear     1470 non-null   int64 
 26  WorkLifeBalance           1470 non-null   int64 
 27  YearsAtCompany            1470 non-null   int64 
 28  YearsInCurrentRole        1470 non-null   int64 
 29  YearsSinceLastPromotion   1470 non-null   int64 
 30  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(23), object(8)
memory usage: 356.1+ KB
In [154]:
print(to_drop)
['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber']
In [155]:
# Select only the numerical columns
num_cols = df.select_dtypes(include=np.number).columns.tolist()

print(num_cols)
['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
In [156]:
count = len(num_cols)
print(count)
23
In [157]:
# create an empty list to store the object columns
object_col = []

# iterate through all columns in the DataFrame
for column in df.columns:
    # check if the column's data type is 'object' and if the number of unique values is less than or equal to 30
    if df[column].dtype == object and len(df[column].unique()) <= 30:
        # if the condition is met, add the column to the list of object columns
        object_col.append(column)
        # print the unique values and their counts for the column
        print(f"{column} : {df[column].unique()}")
        print(df[column].value_counts())
        print("====================================")
Attrition : ['Yes' 'No']
No     1233
Yes     237
Name: Attrition, dtype: int64
====================================
BusinessTravel : ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: BusinessTravel, dtype: int64
====================================
Department : ['Sales' 'Research & Development' 'Human Resources']
Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64
====================================
EducationField : ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
Life Sciences       606
Medical             464
Marketing           159
Technical Degree    132
Other                82
Human Resources      27
Name: EducationField, dtype: int64
====================================
Gender : ['Female' 'Male']
Male      882
Female    588
Name: Gender, dtype: int64
====================================
JobRole : ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
Sales Executive              326
Research Scientist           292
Laboratory Technician        259
Manufacturing Director       145
Healthcare Representative    131
Manager                      102
Sales Representative          83
Research Director             80
Human Resources               52
Name: JobRole, dtype: int64
====================================
MaritalStatus : ['Single' 'Married' 'Divorced']
Married     673
Single      470
Divorced    327
Name: MaritalStatus, dtype: int64
====================================
OverTime : ['Yes' 'No']
No     1054
Yes     416
Name: OverTime, dtype: int64
====================================
In [158]:
# Creating a list of categorical attributes
cat_cols = df.select_dtypes(include='object').columns.tolist()

print(cat_cols)
['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
In [159]:
# I excluded the target variable "Attrition"
cat_cols = [ 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']

I also add to the list of categorical attribute those with inherent order

In [160]:
# Appending another 11 attributes to the cat_cols list
cat_cols1 = ['Education','EnvironmentSatisfaction','JobInvolvement','JobLevel','JobSatisfaction','PerformanceRating','RelationshipSatisfaction','StockOptionLevel','TrainingTimesLastYear','WorkLifeBalance']
for x in cat_cols1:
  cat_cols.append(x)
In [161]:
print(cat_cols)
['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime', 'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TrainingTimesLastYear', 'WorkLifeBalance']
In [162]:
# List of numerical attributes to be scaled
num_at_scale = ['Age', 'DailyRate', 'DistanceFromHome', 'HourlyRate', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
In [163]:
# create a StandardScaler object
scaler = StandardScaler()

# fit and transform the numerical attributes
df[num_at_scale] = scaler.fit_transform(df[num_at_scale])
In [164]:
df.head()
Out[164]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction Gender ... PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 0.446350 Yes Travel_Rarely 0.742527 Sales -1.010909 2 Life Sciences 2 Female ... 3 1 0 -0.421642 0 1 -0.164613 -0.063296 -0.679146 0.245834
1 1.322365 No Travel_Frequently -1.297775 Research & Development -0.147150 1 Life Sciences 3 Male ... 4 4 1 -0.164511 3 3 0.488508 0.764998 -0.368715 0.806541
2 0.008343 Yes Travel_Rarely 1.414363 Research & Development -0.887515 2 Other 4 Male ... 3 2 0 -0.550208 3 3 -1.144294 -1.167687 -0.679146 -1.155935
3 -0.429664 No Travel_Frequently 1.461466 Research & Development -0.764121 4 Life Sciences 4 Female ... 3 3 0 -0.421642 3 3 0.161947 0.764998 0.252146 -1.155935
4 -1.086676 No Travel_Rarely -0.524295 Research & Development -0.887515 1 Medical 1 Male ... 3 4 1 -0.678774 3 3 -0.817734 -0.615492 -0.058285 -0.595227

5 rows × 31 columns

In [165]:
# create a subset of the dataframe with only numerical columns
# num_df = df.select_dtypes(include=['int64', 'float64'])

# create a correlation matrix
corr_matrix = df.corr()

# plot the heatmap

plt.figure(figsize=(40, 40))
sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True, linewidths=.5)
<ipython-input-165-8763b421aa5e>:5: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  corr_matrix = df.corr()
Out[165]:
<Axes: >
In [166]:
# assuming df is DataFrame with numerical features
corr_matrix = df.corr().abs()

# set diagonal elements to 0
np.fill_diagonal(corr_matrix.values, 0)

# get the indices of the maximum correlation value
idx = np.argmax(corr_matrix.values)

# get the corresponding attribute names
cols = corr_matrix.columns
max_corr = corr_matrix.values.flatten()[idx]
attr1, attr2 = np.unravel_index(idx, corr_matrix.shape)
attr1_name, attr2_name = cols[attr1], cols[attr2]

print(f"Highest correlation: {max_corr:.2f}")
print(f"Attributes: {attr1_name}, {attr2_name}")
Highest correlation: 0.95
Attributes: JobLevel, MonthlyIncome
<ipython-input-166-6d094c0b1de0>:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  corr_matrix = df.corr().abs()
In [167]:
# One-hot encoding the chosen categorical attributes
df_one_hot= pd.get_dummies(df, columns=cat_cols)
In [168]:
df_one_hot.head()
Out[168]:
Age Attrition DailyRate DistanceFromHome HourlyRate MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike TotalWorkingYears ... TrainingTimesLastYear_1 TrainingTimesLastYear_2 TrainingTimesLastYear_3 TrainingTimesLastYear_4 TrainingTimesLastYear_5 TrainingTimesLastYear_6 WorkLifeBalance_1 WorkLifeBalance_2 WorkLifeBalance_3 WorkLifeBalance_4
0 0.446350 Yes 0.742527 -1.010909 1.383138 -0.108350 0.726020 2.125136 -1.150554 -0.421642 ... 0 0 0 0 0 0 1 0 0 0
1 1.322365 No -1.297775 -0.147150 -0.240677 -0.291719 1.488876 -0.678049 2.129306 -0.164511 ... 0 0 1 0 0 0 0 0 1 0
2 0.008343 Yes 1.414363 -0.887515 1.284725 -0.937654 -1.674841 1.324226 -0.057267 -0.550208 ... 0 0 1 0 0 0 0 0 1 0
3 -0.429664 No 1.461466 -0.764121 -0.486709 -0.763634 1.243211 -0.678049 -1.150554 -0.421642 ... 0 0 1 0 0 0 0 0 1 0
4 -1.086676 No -0.524295 -0.887515 -1.274014 -0.644858 0.325900 2.525591 -0.877232 -0.678774 ... 0 0 1 0 0 0 0 0 1 0

5 rows × 85 columns

In [169]:
df_one_hot.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 85 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                1470 non-null   float64
 1   Attrition                          1470 non-null   object 
 2   DailyRate                          1470 non-null   float64
 3   DistanceFromHome                   1470 non-null   float64
 4   HourlyRate                         1470 non-null   float64
 5   MonthlyIncome                      1470 non-null   float64
 6   MonthlyRate                        1470 non-null   float64
 7   NumCompaniesWorked                 1470 non-null   float64
 8   PercentSalaryHike                  1470 non-null   float64
 9   TotalWorkingYears                  1470 non-null   float64
 10  YearsAtCompany                     1470 non-null   float64
 11  YearsInCurrentRole                 1470 non-null   float64
 12  YearsSinceLastPromotion            1470 non-null   float64
 13  YearsWithCurrManager               1470 non-null   float64
 14  BusinessTravel_Non-Travel          1470 non-null   uint8  
 15  BusinessTravel_Travel_Frequently   1470 non-null   uint8  
 16  BusinessTravel_Travel_Rarely       1470 non-null   uint8  
 17  Department_Human Resources         1470 non-null   uint8  
 18  Department_Research & Development  1470 non-null   uint8  
 19  Department_Sales                   1470 non-null   uint8  
 20  EducationField_Human Resources     1470 non-null   uint8  
 21  EducationField_Life Sciences       1470 non-null   uint8  
 22  EducationField_Marketing           1470 non-null   uint8  
 23  EducationField_Medical             1470 non-null   uint8  
 24  EducationField_Other               1470 non-null   uint8  
 25  EducationField_Technical Degree    1470 non-null   uint8  
 26  Gender_Female                      1470 non-null   uint8  
 27  Gender_Male                        1470 non-null   uint8  
 28  JobRole_Healthcare Representative  1470 non-null   uint8  
 29  JobRole_Human Resources            1470 non-null   uint8  
 30  JobRole_Laboratory Technician      1470 non-null   uint8  
 31  JobRole_Manager                    1470 non-null   uint8  
 32  JobRole_Manufacturing Director     1470 non-null   uint8  
 33  JobRole_Research Director          1470 non-null   uint8  
 34  JobRole_Research Scientist         1470 non-null   uint8  
 35  JobRole_Sales Executive            1470 non-null   uint8  
 36  JobRole_Sales Representative       1470 non-null   uint8  
 37  MaritalStatus_Divorced             1470 non-null   uint8  
 38  MaritalStatus_Married              1470 non-null   uint8  
 39  MaritalStatus_Single               1470 non-null   uint8  
 40  OverTime_No                        1470 non-null   uint8  
 41  OverTime_Yes                       1470 non-null   uint8  
 42  Education_1                        1470 non-null   uint8  
 43  Education_2                        1470 non-null   uint8  
 44  Education_3                        1470 non-null   uint8  
 45  Education_4                        1470 non-null   uint8  
 46  Education_5                        1470 non-null   uint8  
 47  EnvironmentSatisfaction_1          1470 non-null   uint8  
 48  EnvironmentSatisfaction_2          1470 non-null   uint8  
 49  EnvironmentSatisfaction_3          1470 non-null   uint8  
 50  EnvironmentSatisfaction_4          1470 non-null   uint8  
 51  JobInvolvement_1                   1470 non-null   uint8  
 52  JobInvolvement_2                   1470 non-null   uint8  
 53  JobInvolvement_3                   1470 non-null   uint8  
 54  JobInvolvement_4                   1470 non-null   uint8  
 55  JobLevel_1                         1470 non-null   uint8  
 56  JobLevel_2                         1470 non-null   uint8  
 57  JobLevel_3                         1470 non-null   uint8  
 58  JobLevel_4                         1470 non-null   uint8  
 59  JobLevel_5                         1470 non-null   uint8  
 60  JobSatisfaction_1                  1470 non-null   uint8  
 61  JobSatisfaction_2                  1470 non-null   uint8  
 62  JobSatisfaction_3                  1470 non-null   uint8  
 63  JobSatisfaction_4                  1470 non-null   uint8  
 64  PerformanceRating_3                1470 non-null   uint8  
 65  PerformanceRating_4                1470 non-null   uint8  
 66  RelationshipSatisfaction_1         1470 non-null   uint8  
 67  RelationshipSatisfaction_2         1470 non-null   uint8  
 68  RelationshipSatisfaction_3         1470 non-null   uint8  
 69  RelationshipSatisfaction_4         1470 non-null   uint8  
 70  StockOptionLevel_0                 1470 non-null   uint8  
 71  StockOptionLevel_1                 1470 non-null   uint8  
 72  StockOptionLevel_2                 1470 non-null   uint8  
 73  StockOptionLevel_3                 1470 non-null   uint8  
 74  TrainingTimesLastYear_0            1470 non-null   uint8  
 75  TrainingTimesLastYear_1            1470 non-null   uint8  
 76  TrainingTimesLastYear_2            1470 non-null   uint8  
 77  TrainingTimesLastYear_3            1470 non-null   uint8  
 78  TrainingTimesLastYear_4            1470 non-null   uint8  
 79  TrainingTimesLastYear_5            1470 non-null   uint8  
 80  TrainingTimesLastYear_6            1470 non-null   uint8  
 81  WorkLifeBalance_1                  1470 non-null   uint8  
 82  WorkLifeBalance_2                  1470 non-null   uint8  
 83  WorkLifeBalance_3                  1470 non-null   uint8  
 84  WorkLifeBalance_4                  1470 non-null   uint8  
dtypes: float64(13), object(1), uint8(71)
memory usage: 262.8+ KB
In [170]:
# Replace 'Yes' with 1 and 'No' with 0 in the 'Attrition' column
df_one_hot['Attrition'] = df_one_hot['Attrition'].map({'Yes': 1, 'No': 0})
In [171]:
# Calculate correlations with the target variable
correlations = df_one_hot.corr()['Attrition'].sort_values(ascending=False)
# Set max_rows to None to display the entire list
pd.set_option('display.max_rows', None)
# Print correlations
print(correlations)
Attrition                            1.000000
OverTime_Yes                         0.246118
JobLevel_1                           0.212551
StockOptionLevel_0                   0.195342
MaritalStatus_Single                 0.175419
JobRole_Sales Representative         0.157234
EnvironmentSatisfaction_1            0.122819
JobInvolvement_1                     0.117161
BusinessTravel_Travel_Frequently     0.115143
WorkLifeBalance_1                    0.098689
JobRole_Laboratory Technician        0.098290
JobSatisfaction_1                    0.090329
Department_Sales                     0.080855
DistanceFromHome                     0.077924
EducationField_Technical Degree      0.069355
TrainingTimesLastYear_0              0.061894
RelationshipSatisfaction_1           0.059222
EducationField_Marketing             0.055781
JobInvolvement_2                     0.044731
NumCompaniesWorked                   0.043494
TrainingTimesLastYear_4              0.041216
TrainingTimesLastYear_2              0.037544
EducationField_Human Resources       0.036466
JobRole_Human Resources              0.036215
Gender_Male                          0.029453
Education_3                          0.025723
Education_1                          0.020777
JobRole_Sales Executive              0.019774
Department_Human Resources           0.016832
MonthlyRate                          0.015170
WorkLifeBalance_4                    0.014131
WorkLifeBalance_2                    0.011093
StockOptionLevel_3                   0.010271
JobSatisfaction_3                    0.007015
JobSatisfaction_2                    0.004038
PerformanceRating_4                  0.002889
JobRole_Research Scientist          -0.000360
PerformanceRating_3                 -0.002889
HourlyRate                          -0.006846
Education_2                         -0.006884
RelationshipSatisfaction_3          -0.011984
PercentSalaryHike                   -0.013478
EnvironmentSatisfaction_2           -0.015267
JobLevel_3                          -0.016380
RelationshipSatisfaction_2          -0.017611
EducationField_Other                -0.017898
TrainingTimesLastYear_1             -0.021113
RelationshipSatisfaction_4          -0.022940
Education_4                         -0.025676
Education_5                         -0.028507
Gender_Female                       -0.029453
EducationField_Life Sciences        -0.032703
YearsSinceLastPromotion             -0.033019
TrainingTimesLastYear_5             -0.035170
TrainingTimesLastYear_3             -0.039854
TrainingTimesLastYear_6             -0.040309
EnvironmentSatisfaction_3           -0.044209
EducationField_Medical              -0.046999
EnvironmentSatisfaction_4           -0.047909
BusinessTravel_Travel_Rarely        -0.049538
JobLevel_5                          -0.053566
JobInvolvement_3                    -0.056213
DailyRate                           -0.056652
JobInvolvement_4                    -0.063577
WorkLifeBalance_3                   -0.064301
BusinessTravel_Non-Travel           -0.074457
JobRole_Healthcare Representative   -0.078696
StockOptionLevel_2                  -0.080472
JobRole_Manufacturing Director      -0.082994
JobRole_Manager                     -0.083316
Department_Research & Development   -0.085293
JobLevel_4                          -0.086461
MaritalStatus_Divorced              -0.087716
JobSatisfaction_4                   -0.087830
JobRole_Research Director           -0.088870
MaritalStatus_Married               -0.090984
JobLevel_2                          -0.131138
YearsAtCompany                      -0.134392
StockOptionLevel_1                  -0.151049
YearsWithCurrManager                -0.156199
Age                                 -0.159205
MonthlyIncome                       -0.159840
YearsInCurrentRole                  -0.160545
TotalWorkingYears                   -0.171063
OverTime_No                         -0.246118
Name: Attrition, dtype: float64
In [172]:
# Separating independent and dependent attributes
X = df_one_hot.drop(['Attrition'], axis = 1)
y = df_one_hot['Attrition']
In [173]:
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 84 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                1470 non-null   float64
 1   DailyRate                          1470 non-null   float64
 2   DistanceFromHome                   1470 non-null   float64
 3   HourlyRate                         1470 non-null   float64
 4   MonthlyIncome                      1470 non-null   float64
 5   MonthlyRate                        1470 non-null   float64
 6   NumCompaniesWorked                 1470 non-null   float64
 7   PercentSalaryHike                  1470 non-null   float64
 8   TotalWorkingYears                  1470 non-null   float64
 9   YearsAtCompany                     1470 non-null   float64
 10  YearsInCurrentRole                 1470 non-null   float64
 11  YearsSinceLastPromotion            1470 non-null   float64
 12  YearsWithCurrManager               1470 non-null   float64
 13  BusinessTravel_Non-Travel          1470 non-null   uint8  
 14  BusinessTravel_Travel_Frequently   1470 non-null   uint8  
 15  BusinessTravel_Travel_Rarely       1470 non-null   uint8  
 16  Department_Human Resources         1470 non-null   uint8  
 17  Department_Research & Development  1470 non-null   uint8  
 18  Department_Sales                   1470 non-null   uint8  
 19  EducationField_Human Resources     1470 non-null   uint8  
 20  EducationField_Life Sciences       1470 non-null   uint8  
 21  EducationField_Marketing           1470 non-null   uint8  
 22  EducationField_Medical             1470 non-null   uint8  
 23  EducationField_Other               1470 non-null   uint8  
 24  EducationField_Technical Degree    1470 non-null   uint8  
 25  Gender_Female                      1470 non-null   uint8  
 26  Gender_Male                        1470 non-null   uint8  
 27  JobRole_Healthcare Representative  1470 non-null   uint8  
 28  JobRole_Human Resources            1470 non-null   uint8  
 29  JobRole_Laboratory Technician      1470 non-null   uint8  
 30  JobRole_Manager                    1470 non-null   uint8  
 31  JobRole_Manufacturing Director     1470 non-null   uint8  
 32  JobRole_Research Director          1470 non-null   uint8  
 33  JobRole_Research Scientist         1470 non-null   uint8  
 34  JobRole_Sales Executive            1470 non-null   uint8  
 35  JobRole_Sales Representative       1470 non-null   uint8  
 36  MaritalStatus_Divorced             1470 non-null   uint8  
 37  MaritalStatus_Married              1470 non-null   uint8  
 38  MaritalStatus_Single               1470 non-null   uint8  
 39  OverTime_No                        1470 non-null   uint8  
 40  OverTime_Yes                       1470 non-null   uint8  
 41  Education_1                        1470 non-null   uint8  
 42  Education_2                        1470 non-null   uint8  
 43  Education_3                        1470 non-null   uint8  
 44  Education_4                        1470 non-null   uint8  
 45  Education_5                        1470 non-null   uint8  
 46  EnvironmentSatisfaction_1          1470 non-null   uint8  
 47  EnvironmentSatisfaction_2          1470 non-null   uint8  
 48  EnvironmentSatisfaction_3          1470 non-null   uint8  
 49  EnvironmentSatisfaction_4          1470 non-null   uint8  
 50  JobInvolvement_1                   1470 non-null   uint8  
 51  JobInvolvement_2                   1470 non-null   uint8  
 52  JobInvolvement_3                   1470 non-null   uint8  
 53  JobInvolvement_4                   1470 non-null   uint8  
 54  JobLevel_1                         1470 non-null   uint8  
 55  JobLevel_2                         1470 non-null   uint8  
 56  JobLevel_3                         1470 non-null   uint8  
 57  JobLevel_4                         1470 non-null   uint8  
 58  JobLevel_5                         1470 non-null   uint8  
 59  JobSatisfaction_1                  1470 non-null   uint8  
 60  JobSatisfaction_2                  1470 non-null   uint8  
 61  JobSatisfaction_3                  1470 non-null   uint8  
 62  JobSatisfaction_4                  1470 non-null   uint8  
 63  PerformanceRating_3                1470 non-null   uint8  
 64  PerformanceRating_4                1470 non-null   uint8  
 65  RelationshipSatisfaction_1         1470 non-null   uint8  
 66  RelationshipSatisfaction_2         1470 non-null   uint8  
 67  RelationshipSatisfaction_3         1470 non-null   uint8  
 68  RelationshipSatisfaction_4         1470 non-null   uint8  
 69  StockOptionLevel_0                 1470 non-null   uint8  
 70  StockOptionLevel_1                 1470 non-null   uint8  
 71  StockOptionLevel_2                 1470 non-null   uint8  
 72  StockOptionLevel_3                 1470 non-null   uint8  
 73  TrainingTimesLastYear_0            1470 non-null   uint8  
 74  TrainingTimesLastYear_1            1470 non-null   uint8  
 75  TrainingTimesLastYear_2            1470 non-null   uint8  
 76  TrainingTimesLastYear_3            1470 non-null   uint8  
 77  TrainingTimesLastYear_4            1470 non-null   uint8  
 78  TrainingTimesLastYear_5            1470 non-null   uint8  
 79  TrainingTimesLastYear_6            1470 non-null   uint8  
 80  WorkLifeBalance_1                  1470 non-null   uint8  
 81  WorkLifeBalance_2                  1470 non-null   uint8  
 82  WorkLifeBalance_3                  1470 non-null   uint8  
 83  WorkLifeBalance_4                  1470 non-null   uint8  
dtypes: float64(13), uint8(71)
memory usage: 251.3 KB
In [174]:
y.info()
<class 'pandas.core.series.Series'>
RangeIndex: 1470 entries, 0 to 1469
Series name: Attrition
Non-Null Count  Dtype
--------------  -----
1470 non-null   int64
dtypes: int64(1)
memory usage: 11.6 KB
In [175]:
y.info()
<class 'pandas.core.series.Series'>
RangeIndex: 1470 entries, 0 to 1469
Series name: Attrition
Non-Null Count  Dtype
--------------  -----
1470 non-null   int64
dtypes: int64(1)
memory usage: 11.6 KB

MODEL BUILDING



In [176]:
# This will split the data into training and test sets, while ensuring that the proportion of the minority class is maintained in both sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

PERFORMING DATA AUGMENTATION



In [177]:
# The target variable is heavily imbalanced, as observed earlier. To address this issue, we can utilize the SMOTE method from the imblearn package in Python to oversample the minority class.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
  1. LOGISTIC REGRESSION


In [178]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV

# # create logistic regression model
# model = LogisticRegression(random_state=42, max_iter=1000)

# # define hyperparameters to search over
# hyperparameters = {'C': [0.01, 0.1, 1, 10],
#                    'penalty': [ 'l2', None],#,,'l1'
#                    'solver': ['lbfgs','saga',  'sag'],#
#                    'class_weight': [None, 'balanced']}

# # perform grid search cross-validation
# grid_search = GridSearchCV(model, hyperparameters, cv=5, n_jobs=-1)
# grid_search.fit(X_train_smote, y_train_smote)

# # print best hyperparameters
# print(grid_search.best_params_)

# # predict on test data using best model
# y_pred = grid_search.predict(X_test)

# # evaluate model performance on test data
# from sklearn.metrics import classification_report
# print(classification_report(y_test, y_pred))
In [179]:
# create logistic regression model
lr_smote = LogisticRegression(random_state=42, max_iter=1000, class_weight = 'balanced' )

# train model on resampled data
lr_smote.fit(X_train_smote, y_train_smote)

# predict on test data
y_pred_lr_smote = lr_smote.predict(X_test)

lr_smote_metrics = classification_report(y_test, y_pred_lr_smote) 
# evaluate model performance on test data
print(classification_report(y_test, y_pred_lr_smote))
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       247
           1       0.70      0.49      0.57        47

    accuracy                           0.88       294
   macro avg       0.80      0.72      0.75       294
weighted avg       0.87      0.88      0.88       294

In [180]:
# get predicted probabilities for positive class
y_score_lr_smote = lr_smote.predict_proba(X_test)[:, 1]

# calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score_lr_smote)

# calculate AUC score
roc_auc = auc(fpr, tpr)
# set the figure size
plt.figure(figsize=(8, 6))
# plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Logistic Regression (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
In [181]:
# get predicted labels
y_pred_lr_smote = lr_smote.predict(X_test)

# calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_lr_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. DECISION TREE CLASSIFIER


In [182]:
# from sklearn.model_selection import GridSearchCV

# # create Decision Tree model
# dt_smote = DecisionTreeClassifier(random_state=42)

# # define hyperparameter grid
# param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'splitter': ['best', 'random'],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': [None, 'sqrt', 'log2'],
#     'class_weight': [None, 'balanced'],
#     'ccp_alpha': [0.0, 0.1, 0.01]
# }

# # perform grid search with 5-fold cross validation
# dt_smote_grid = GridSearchCV(dt_smote, param_grid, cv=5)

# # fit grid search on training data
# dt_smote_grid.fit(X_train_smote, y_train_smote)

# # predict on test data using best model
# y_pred = dt_smote_grid.predict(X_test)

# # evaluate model performance on test data
# dt_smote_metrics = classification_report(y_test, y_pred)
# print(dt_smote_metrics)

# # print best hyperparameters
# print("Best hyperparameters:", dt_smote_grid.best_params_)
In [183]:
# create Decision Tree model
dt_smote = DecisionTreeClassifier(random_state=42, max_depth = 4,min_samples_leaf = 1, class_weight = 'balanced' )

# train model on resampled data
dt_smote.fit(X_train_smote, y_train_smote)

# predict on test data
y_pred_dt_smote = dt_smote.predict(X_test)

# evaluate model performance on test data
dt_smote_metrics = classification_report(y_test, y_pred_dt_smote)
print(dt_smote_metrics)
              precision    recall  f1-score   support

           0       0.90      0.87      0.88       247
           1       0.42      0.49      0.45        47

    accuracy                           0.81       294
   macro avg       0.66      0.68      0.67       294
weighted avg       0.82      0.81      0.82       294

In [184]:
from sklearn.tree import plot_tree

# Plot the decision tree
plt.figure(figsize=(20,10))
plot_tree(dt_smote, filled=True, feature_names=X.columns, class_names=['0', '1'], fontsize=10)
plt.show()
In [185]:
# get predicted probabilities for positive class
y_prob_dt_smote = dt_smote.predict_proba(X_test)[:, 1]

# calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_dt_smote)

# calculate AUC score
auc = roc_auc_score(y_test, y_prob_dt_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Decision Tree (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
In [186]:
# get predicted labels
y_pred_dt_smote = dt_smote.predict(X_test)

# calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_dt_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. RANDOM FOREST


In [187]:
# # create a random forest classifier
# rf = RandomForestClassifier(random_state=42)

# # define the hyperparameter grid
# param_grid = {
#     'n_estimators': [50, 100, 200, 300],
#     'max_depth': [5, 10, 15, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # create the grid search object
# grid_rf = GridSearchCV(rf, param_grid, cv=5)


# # fit the grid search object to the data
# grid_rf.fit(X_train_smote, y_train_smote)

# # print the best parameters and the corresponding score
# print("Best parameters: ", grid_rf.best_params_)
# print("Best score: ", grid_rf.best_score_)
In [188]:
# create Random Forest model
rf_smote = RandomForestClassifier(random_state=42, max_depth = 4, class_weight = 'balanced', n_estimators=100)

# train model on train data
rf_smote.fit(X_train_smote, y_train_smote)

# predict on test data
y_pred_rf_smote = rf_smote.predict(X_test)

rf_smote_metrics = classification_report(y_test, y_pred_rf_smote)
# evaluate model performance on test data
print(rf_smote_metrics)
              precision    recall  f1-score   support

           0       0.90      0.87      0.89       247
           1       0.43      0.49      0.46        47

    accuracy                           0.81       294
   macro avg       0.66      0.68      0.67       294
weighted avg       0.82      0.81      0.82       294

In [189]:
# Plot the first tree in the random forest
plt.figure(figsize=(20, 10))
plot_tree(rf_smote.estimators_[0], filled=True, feature_names=X.columns, class_names=['0', '1'], fontsize=10)
plt.show()
In [190]:
# get predicted probabilities for positive class
y_prob_rf_smote = rf_smote.predict_proba(X_test)[:, 1]

# calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_rf_smote)

# calculate AUC score
auc = roc_auc_score(y_test, y_prob_rf_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Random Forest (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
In [191]:
# get predicted labels
y_pred_rf_smote = rf_smote.predict(X_test)

# calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_rf_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

4.GRADIENT BOOSTING



In [192]:
# # define the model
# model = GradientBoostingClassifier()

# # define the hyperparameter space to search
# param_dist = {'n_estimators': sp_randint(50, 200),
#               'max_depth': sp_randint(2, 10),
#               'min_samples_split': sp_randint(2, 10),
#               'min_samples_leaf': sp_randint(1, 10),
#               'learning_rate': [0.1, 0.01, 0.001]}

# # perform randomized search with 10 iterations
# random_search = RandomizedSearchCV(model, param_distributions=param_dist,
#                                    n_iter=10, cv=5, scoring='f1', random_state=42)

# # fit the randomized search object to the data
# random_search.fit(X_train_smote, y_train_smote)

# # print the best hyperparameters and corresponding f1 score
# print('Best hyperparameters:', random_search.best_params_)
# print('Corresponding f1 score:', random_search.best_score_)
In [193]:
# Create Gradient Boosting Classifier
gb_smote = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb_smote.fit(X_train_smote, y_train_smote)

# Predict the target variable for test data
y_pred_gb_smote = gb_smote.predict(X_test)

gb_smote_metrics = classification_report(y_test, y_pred_gb_smote) 
# evaluate model performance on test data
print(gb_smote_metrics)
              precision    recall  f1-score   support

           0       0.88      0.96      0.92       247
           1       0.58      0.32      0.41        47

    accuracy                           0.85       294
   macro avg       0.73      0.64      0.66       294
weighted avg       0.83      0.85      0.84       294

In [194]:
# Get predicted probabilities for positive class
y_prob_gb_smote = gb_smote.predict_proba(X_test)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_gb_smote)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_gb_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Gradient Boosting (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
In [195]:
# Get predicted labels
y_pred_gb_smote = gb_smote.predict(X_test)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_gb_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. XGBoost CLASSIFIER


In [196]:
# # import necessary libraries
# from sklearn.model_selection import GridSearchCV

# # define the parameter grid to search
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.1, 0.01, 0.001],
#     'subsample': [0.5, 0.7, 1],
#     'colsample_bytree': [0.5, 0.7, 1]
# }

# # create XGBoost Classifier
# xgb_smote = xgb.XGBClassifier(random_state=42)

# # create GridSearchCV object
# grid_search = GridSearchCV(estimator=xgb_smote, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# # fit the GridSearchCV object to the data
# grid_search.fit(X_train_smote, y_train_smote)

# # get the best hyperparameters and the corresponding score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# # print the best hyperparameters and the corresponding score
# print('Best hyperparameters: ', best_params)
# print('Best score: ', best_score)

# # predict on test data using the best model
# best_xgb_smote = grid_search.best_estimator_
# y_pred = best_xgb_smote.predict(X_test)

# # evaluate model performance on test data
# xgb_smote_metrics = classification_report(y_test, y_pred)
# print(xgb_smote_metrics)
In [197]:
# Create XGBoost Classifier
xgb_smote = xgb.XGBClassifier(random_state=42)

# Fit the model to the training data
xgb_smote.fit(X_train_smote, y_train_smote)

# Predict the target variable for test data
y_pred_xgb_smote = xgb_smote.predict(X_test)

xgb_smote_metrics = classification_report(y_test, y_pred_xgb_smote) 
# evaluate model performance on test data
print(xgb_smote_metrics)
              precision    recall  f1-score   support

           0       0.88      0.95      0.91       247
           1       0.54      0.30      0.38        47

    accuracy                           0.85       294
   macro avg       0.71      0.62      0.65       294
weighted avg       0.82      0.85      0.83       294

In [198]:
# get predicted probabilities for positive class
y_prob_xgb_smote = xgb_smote.predict_proba(X_test)[:, 1]

# calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_xgb_smote)

# calculate AUC score
auc = roc_auc_score(y_test, y_prob_xgb_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'XGBoost (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
In [199]:
# get predicted labels
y_pred_xgb_smote = xgb_smote.predict(X_test)

# calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_xgb_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
In [200]:
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Create AdaBoost model
ada_smote = AdaBoostClassifier(random_state=42)

# Train the model on the training data
ada_smote.fit(X_train_smote, y_train_smote)

# Predict on the test data
y_pred_ada_smote = ada_smote.predict(X_test)

# Generate classification report
ada_smote_metrics = classification_report(y_test, y_pred_ada_smote, zero_division=1)
print(ada_smote_metrics)
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       247
           1       0.48      0.45      0.46        47

    accuracy                           0.83       294
   macro avg       0.69      0.68      0.68       294
weighted avg       0.83      0.83      0.83       294

In [201]:
# get predicted probabilities for positive class
y_prob_ada_smote = ada_smote.predict_proba(X_test)[:, 1]

# calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_ada_smote)

# calculate AUC score
auc = roc_auc_score(y_test, y_prob_ada_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AdaBoost (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
In [202]:
# get predicted labels
y_pred_ada_smote = ada_smote.predict(X_test)

# calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_ada_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
In [203]:
# Creat new df without the target attribute
df_one_hot1 = df_one_hot.drop('Attrition', axis=1)
In [204]:
# Check
df_one_hot1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 84 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                1470 non-null   float64
 1   DailyRate                          1470 non-null   float64
 2   DistanceFromHome                   1470 non-null   float64
 3   HourlyRate                         1470 non-null   float64
 4   MonthlyIncome                      1470 non-null   float64
 5   MonthlyRate                        1470 non-null   float64
 6   NumCompaniesWorked                 1470 non-null   float64
 7   PercentSalaryHike                  1470 non-null   float64
 8   TotalWorkingYears                  1470 non-null   float64
 9   YearsAtCompany                     1470 non-null   float64
 10  YearsInCurrentRole                 1470 non-null   float64
 11  YearsSinceLastPromotion            1470 non-null   float64
 12  YearsWithCurrManager               1470 non-null   float64
 13  BusinessTravel_Non-Travel          1470 non-null   uint8  
 14  BusinessTravel_Travel_Frequently   1470 non-null   uint8  
 15  BusinessTravel_Travel_Rarely       1470 non-null   uint8  
 16  Department_Human Resources         1470 non-null   uint8  
 17  Department_Research & Development  1470 non-null   uint8  
 18  Department_Sales                   1470 non-null   uint8  
 19  EducationField_Human Resources     1470 non-null   uint8  
 20  EducationField_Life Sciences       1470 non-null   uint8  
 21  EducationField_Marketing           1470 non-null   uint8  
 22  EducationField_Medical             1470 non-null   uint8  
 23  EducationField_Other               1470 non-null   uint8  
 24  EducationField_Technical Degree    1470 non-null   uint8  
 25  Gender_Female                      1470 non-null   uint8  
 26  Gender_Male                        1470 non-null   uint8  
 27  JobRole_Healthcare Representative  1470 non-null   uint8  
 28  JobRole_Human Resources            1470 non-null   uint8  
 29  JobRole_Laboratory Technician      1470 non-null   uint8  
 30  JobRole_Manager                    1470 non-null   uint8  
 31  JobRole_Manufacturing Director     1470 non-null   uint8  
 32  JobRole_Research Director          1470 non-null   uint8  
 33  JobRole_Research Scientist         1470 non-null   uint8  
 34  JobRole_Sales Executive            1470 non-null   uint8  
 35  JobRole_Sales Representative       1470 non-null   uint8  
 36  MaritalStatus_Divorced             1470 non-null   uint8  
 37  MaritalStatus_Married              1470 non-null   uint8  
 38  MaritalStatus_Single               1470 non-null   uint8  
 39  OverTime_No                        1470 non-null   uint8  
 40  OverTime_Yes                       1470 non-null   uint8  
 41  Education_1                        1470 non-null   uint8  
 42  Education_2                        1470 non-null   uint8  
 43  Education_3                        1470 non-null   uint8  
 44  Education_4                        1470 non-null   uint8  
 45  Education_5                        1470 non-null   uint8  
 46  EnvironmentSatisfaction_1          1470 non-null   uint8  
 47  EnvironmentSatisfaction_2          1470 non-null   uint8  
 48  EnvironmentSatisfaction_3          1470 non-null   uint8  
 49  EnvironmentSatisfaction_4          1470 non-null   uint8  
 50  JobInvolvement_1                   1470 non-null   uint8  
 51  JobInvolvement_2                   1470 non-null   uint8  
 52  JobInvolvement_3                   1470 non-null   uint8  
 53  JobInvolvement_4                   1470 non-null   uint8  
 54  JobLevel_1                         1470 non-null   uint8  
 55  JobLevel_2                         1470 non-null   uint8  
 56  JobLevel_3                         1470 non-null   uint8  
 57  JobLevel_4                         1470 non-null   uint8  
 58  JobLevel_5                         1470 non-null   uint8  
 59  JobSatisfaction_1                  1470 non-null   uint8  
 60  JobSatisfaction_2                  1470 non-null   uint8  
 61  JobSatisfaction_3                  1470 non-null   uint8  
 62  JobSatisfaction_4                  1470 non-null   uint8  
 63  PerformanceRating_3                1470 non-null   uint8  
 64  PerformanceRating_4                1470 non-null   uint8  
 65  RelationshipSatisfaction_1         1470 non-null   uint8  
 66  RelationshipSatisfaction_2         1470 non-null   uint8  
 67  RelationshipSatisfaction_3         1470 non-null   uint8  
 68  RelationshipSatisfaction_4         1470 non-null   uint8  
 69  StockOptionLevel_0                 1470 non-null   uint8  
 70  StockOptionLevel_1                 1470 non-null   uint8  
 71  StockOptionLevel_2                 1470 non-null   uint8  
 72  StockOptionLevel_3                 1470 non-null   uint8  
 73  TrainingTimesLastYear_0            1470 non-null   uint8  
 74  TrainingTimesLastYear_1            1470 non-null   uint8  
 75  TrainingTimesLastYear_2            1470 non-null   uint8  
 76  TrainingTimesLastYear_3            1470 non-null   uint8  
 77  TrainingTimesLastYear_4            1470 non-null   uint8  
 78  TrainingTimesLastYear_5            1470 non-null   uint8  
 79  TrainingTimesLastYear_6            1470 non-null   uint8  
 80  WorkLifeBalance_1                  1470 non-null   uint8  
 81  WorkLifeBalance_2                  1470 non-null   uint8  
 82  WorkLifeBalance_3                  1470 non-null   uint8  
 83  WorkLifeBalance_4                  1470 non-null   uint8  
dtypes: float64(13), uint8(71)
memory usage: 251.3 KB

PERFORMING PCA



In [205]:
# Perform PCA to reduce dimensionality
pca = PCA().fit(X)  # X data matrix


# Plot the cumulative explained variance ratio for all the principal components
plt.figure(figsize=(5, 5))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()

# Determine the number of components needed to explain 80-90% variance
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance_ratio >= 0.8) + 1
print("Number of components to keep 80% variance:", n_components)

n_components = np.argmax(cumulative_variance_ratio >= 0.9) + 1
print("Number of components to keep 90% variance:", n_components)
Number of components to keep 80% variance: 25
Number of components to keep 90% variance: 36
In [206]:
# From the above graph, i cocnclude that the number of principal components to choose is 36
In [207]:
# Fit PCA to training AUGMENTED data
pca = PCA(n_components=36)
X_train_pca_smote = pca.fit_transform(X_train_smote)

# Transform test data using the same PCA object
X_test_pca = pca.transform(X_test)
  1. PCA (SMOTE) - LOGISTIC REGRESSION


In [208]:
# create logistic regression model
lr_pca_smote = LogisticRegression(random_state=42, max_iter=1000, class_weight = 'balanced' )

# train model on resampled data
lr_pca_smote.fit(X_train_pca_smote, y_train_smote)

# predict on test data
y_pred_lr_pca_smote = lr_pca_smote.predict(X_test_pca)

lr_pca_smote_metrics = classification_report(y_test, y_pred_lr_pca_smote)
# evaluate model performance on test data
print(lr_pca_smote_metrics)
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       247
           1       0.55      0.47      0.51        47

    accuracy                           0.85       294
   macro avg       0.73      0.70      0.71       294
weighted avg       0.85      0.85      0.85       294

In [209]:
# get predicted probabilities for positive class
y_prob_lr_pca_smote = lr_pca_smote.predict_proba(X_test_pca)[:, 1]

# calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_lr_pca_smote)

# calculate AUC score
auc = roc_auc_score(y_test, y_prob_lr_pca_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Logistic Regression (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
In [210]:
# get predicted labels
y_pred_lr_pca_smote = lr_pca_smote.predict(X_test_pca)

# calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_lr_pca_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. PCA (SMOTE) - DECISION TREE


In [211]:
# create Decision Tree model
dt_pca_smote = DecisionTreeClassifier(random_state=42,  max_depth = 4,min_samples_leaf = 1, class_weight = 'balanced')

# train model on resampled data
dt_pca_smote.fit(X_train_pca_smote, y_train_smote)

# predict on test data
y_pred_dt_pca_smote = dt_pca_smote.predict(X_test_pca)

# evaluate model performance on test data
dt_pca_smote_metrics = classification_report(y_test, y_pred_dt_pca_smote)
print(dt_pca_smote_metrics)
              precision    recall  f1-score   support

           0       0.93      0.77      0.84       247
           1       0.37      0.70      0.48        47

    accuracy                           0.76       294
   macro avg       0.65      0.74      0.66       294
weighted avg       0.84      0.76      0.78       294

In [212]:
# get predicted probabilities for positive class
y_prob_lr_pca_smote = lr_pca_smote.predict_proba(X_test_pca)[:, 1]

# calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_lr_pca_smote)

# calculate AUC score
auc = roc_auc_score(y_test, y_prob_lr_pca_smote)

# set the figure size
plt.figure(figsize=(8, 6))

# plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Logistic Regression (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
In [213]:
# get predicted labels
y_pred_lr_pca_smote = lr_pca_smote.predict(X_test_pca)

# calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_lr_pca_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. PCA (SMOTE) - RANDOM FOREST


In [214]:
# create Random Forest model
rf_pca_smote = RandomForestClassifier(random_state=42, criterion='gini', max_depth = 10, min_samples_leaf = 5, min_samples_split = 4, n_estimators = 100)

# train model on train data
rf_pca_smote.fit(X_train_pca_smote, y_train_smote)

# predict on test data
y_pred_rf_pca_smote = rf_pca_smote.predict(X_test_pca)

rf_pca_smote_metrics = classification_report(y_test, y_pred_rf_pca_smote)
# evaluate model performance on test data
print(rf_pca_smote_metrics)
              precision    recall  f1-score   support

           0       0.88      0.95      0.91       247
           1       0.54      0.30      0.38        47

    accuracy                           0.85       294
   macro avg       0.71      0.62      0.65       294
weighted avg       0.82      0.85      0.83       294

In [215]:
# create Random Forest model
rf_pca_smote = RandomForestClassifier(random_state=42, criterion='gini', max_depth=10, min_samples_leaf=5, min_samples_split=4, n_estimators=100)

# train model on train data
rf_pca_smote.fit(X_train_pca_smote, y_train_smote)

# get predicted probabilities for positive class
y_prob_rf_pca_smote = rf_pca_smote.predict_proba(X_test_pca)[:, 1]

# calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_rf_pca_smote)

# calculate AUC score
auc = roc_auc_score(y_test, y_prob_rf_pca_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Random Forest (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [216]:
# create Random Forest model
rf_pca_smote = RandomForestClassifier(random_state=42, criterion='gini', max_depth=10, min_samples_leaf=5, min_samples_split=4, n_estimators=100)

# train model on train data
rf_pca_smote.fit(X_train_pca_smote, y_train_smote)

# predict on test data
y_pred_rf_pca_smote = rf_pca_smote.predict(X_test_pca)

# calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_rf_pca_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. PCA (SMOTE) - GRADIENT BOOSTING


In [217]:
# Create Gradient Boosting Classifier
gb_pca_smote = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb_pca_smote.fit(X_train_pca_smote, y_train_smote)

# Predict the target variable for test data
y_pred_gb_pca_smote = gb_pca_smote.predict(X_test_pca)

gb_pca_smote_metrics = classification_report(y_test, y_pred_gb_pca_smote) 
# evaluate model performance on test data
print(gb_pca_smote_metrics)
              precision    recall  f1-score   support

           0       0.89      0.90      0.89       247
           1       0.43      0.40      0.42        47

    accuracy                           0.82       294
   macro avg       0.66      0.65      0.66       294
weighted avg       0.82      0.82      0.82       294

In [218]:
# Create Gradient Boosting Classifier
gb_pca_smote = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb_pca_smote.fit(X_train_pca_smote, y_train_smote)

# Predict the target variable probabilities for test data
y_prob_gb_pca_smote = gb_pca_smote.predict_proba(X_test_pca)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_gb_pca_smote)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_gb_pca_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Gradient Boosting (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [219]:
# Create Gradient Boosting Classifier
gb_pca_smote = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb_pca_smote.fit(X_train_pca_smote, y_train_smote)

# Predict the target variable for test data
y_pred_gb_pca_smote = gb_pca_smote.predict(X_test_pca)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_gb_pca_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. PCA (SMOTE) - XGBoost CLASSIFIER


In [220]:
# Create XGBoost Classifier
xgb_pca_smote = xgb.XGBClassifier(random_state=42)

# Fit the model to the training data
xgb_pca_smote.fit(X_train_pca_smote, y_train_smote)

# Predict the target variable for test data
y_pred_xgb_pca_smote = xgb_pca_smote.predict(X_test_pca)

xgb_pca_smote_metrics = classification_report(y_test, y_pred_xgb_pca_smote)
# evaluate model performance on test data
print(xgb_pca_smote_metrics)
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       247
           1       0.49      0.38      0.43        47

    accuracy                           0.84       294
   macro avg       0.69      0.65      0.67       294
weighted avg       0.82      0.84      0.83       294

In [221]:
# Create XGBoost Classifier
xgb_pca_smote = xgb.XGBClassifier(random_state=42)

# Fit the model to the training data
xgb_pca_smote.fit(X_train_pca_smote, y_train_smote)

# Predict the target variable probabilities for test data
y_prob_xgb_pca_smote = xgb_pca_smote.predict_proba(X_test_pca)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_xgb_pca_smote)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_xgb_pca_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'XGBoost (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [222]:
# Create XGBoost Classifier
xgb_pca_smote = xgb.XGBClassifier(random_state=42)

# Fit the model to the training data
xgb_pca_smote.fit(X_train_pca_smote, y_train_smote)

# Predict the target variable for test data
y_pred_xgb_pca_smote = xgb_pca_smote.predict(X_test_pca)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_xgb_pca_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
In [222]:

RECURSIVE FEATURE ELIMINATION



In [223]:
# define the model to use for feature selection
model = RandomForestClassifier(n_estimators=100, random_state=42)

# define the RFE method and specify the number of features to select
rfe = RFE(model, n_features_to_select=20)

# fit the RFE model to the training data
rfe.fit(X_train_smote, y_train_smote)

# extract the selected feature indices
selected_features = rfe.get_support(indices=True)

# convert selected_features to a comma-separated string
selected_features_str = ','.join(map(str, selected_features))

# subset the training and testing data to include only the selected features
X_train_rfe = X_train_smote.iloc[:, selected_features]
X_test_rfe = X_test.iloc[:, selected_features]

# Print the selected_features as a comma-separated list
print(selected_features_str)
0,1,2,3,4,5,6,7,8,9,10,11,12,36,37,39,49,62,70,82
In [224]:
# Create a new Random Forest Classifier using only the selected features
model = RandomForestClassifier(n_estimators=100, random_state=42)
X_train_selected = X_train_smote.iloc[:, selected_features]
X_test_selected = X_test.iloc[:, selected_features]

# Train the model on the selected features
model.fit(X_train_selected, y_train_smote)

# Make predictions on the test data using the trained model
y_pred = model.predict(X_test_selected)

# Evaluate the performance of the model
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.87      0.94      0.90       247
           1       0.41      0.23      0.30        47

    accuracy                           0.82       294
   macro avg       0.64      0.58      0.60       294
weighted avg       0.79      0.82      0.80       294

In [225]:
from sklearn.linear_model import LinearRegression

# Define the model to use for feature selection
model = LinearRegression()

# Define the RFE method and specify the number of features to select
rfe = RFE(model, n_features_to_select=20)

# Fit the RFE model to the training data
rfe.fit(X_train, y_train)

# Extract the selected feature indices
selected_features = rfe.get_support(indices=True)

# Convert selected_features to a comma-separated string
selected_features_str = ','.join(map(str, selected_features))

# Subset the training and testing data to include only the selected features
X_train_rfe = X_train.iloc[:, selected_features]
X_test_rfe = X_test.iloc[:, selected_features]

# Print the selected_features as a comma-separated list
print(selected_features_str)
19,20,21,22,23,24,54,55,56,57,58,65,67,73,74,75,76,77,78,79
In [226]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create a new Linear Regression model using only the selected features
model = LinearRegression()
X_train_selected = X_train.iloc[:, selected_features]
X_test_selected = X_test.iloc[:, selected_features]

# Train the model on the selected features
model.fit(X_train_selected, y_train)

# Make predictions on the test data using the trained model
y_pred = model.predict(X_test_selected)

# Evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
Mean Squared Error: 0.12468495336519618

WITHOUT DATA AUGMENTATION / SMOTE



  1. LOGISTIC REGRESSION


In [227]:
# create logistic regression model
lr = LogisticRegression(random_state=42, max_iter=1000, class_weight = None )

# train model on resampled data
lr.fit(X_train, y_train)

# predict on test data
y_pred_lr = lr.predict(X_test)

lr_metrics = classification_report(y_test, y_pred_lr)
# evaluate model performance on test data
print(lr_metrics)
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       247
           1       0.71      0.47      0.56        47

    accuracy                           0.88       294
   macro avg       0.81      0.72      0.75       294
weighted avg       0.87      0.88      0.87       294

In [228]:
# Create logistic regression model
lr = LogisticRegression(random_state=42, max_iter=1000, class_weight=None)

# Train model on resampled data
lr.fit(X_train, y_train)

# Get predicted probabilities for positive class
y_prob_lr = lr.predict_proba(X_test)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_lr)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_lr)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Logistic Regression (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [229]:
# Create logistic regression model
lr = LogisticRegression(random_state=42, max_iter=1000, class_weight=None)

# Train model on resampled data
lr.fit(X_train, y_train)

# Predict on test data
y_pred_lr = lr.predict(X_test)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_lr)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. DECISION TREE


In [230]:
# from sklearn.model_selection import GridSearchCV

# # create Decision Tree model
# dt = DecisionTreeClassifier(random_state=42)

# # define hyperparameter grid
# param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'splitter': ['best', 'random'],
#     'max_depth': [None, 10, 20, 25],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': [None, 'sqrt', 'log2'],
#     'class_weight': [None, 'balanced'],
#     'ccp_alpha': [0.0, 0.1, 0.01]
# }

# # perform grid search with 5-fold cross validation
# dt_grid = GridSearchCV(dt, param_grid, cv=5)

# # fit grid search on training data
# dt_grid.fit(X_train, y_train)

# # predict on test data using best model
# y_pred = dt_grid.predict(X_test)

# # evaluate model performance on test data
# dt_metrics = classification_report(y_test, y_pred)
# print(dt_metrics)

# # print best hyperparameters
# print("Best hyperparameters:", dt_grid.best_params_)
In [231]:
# create Decision Tree model
dt = DecisionTreeClassifier(random_state=42, criterion='entropy', splitter='random', max_depth=100, min_samples_split=10, min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None, class_weight=None, ccp_alpha=0.0)

# train model on resampled data
dt.fit(X_train, y_train)

# predict on test data
y_pred_dt = dt.predict(X_test)

# evaluate model performance on test data
dt_metrics = classification_report(y_test, y_pred_dt)
print(dt_metrics)
              precision    recall  f1-score   support

           0       0.88      0.90      0.89       247
           1       0.40      0.36      0.38        47

    accuracy                           0.81       294
   macro avg       0.64      0.63      0.64       294
weighted avg       0.80      0.81      0.81       294

In [232]:
# Create Decision Tree model
dt = DecisionTreeClassifier(random_state=42, criterion='entropy', splitter='random', max_depth=100, min_samples_split=10, min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None, class_weight=None, ccp_alpha=0.0)

# Train model on resampled data
dt.fit(X_train, y_train)

# Get predicted probabilities for positive class
y_prob_dt = dt.predict_proba(X_test)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_dt)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_dt)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Decision Tree (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [233]:
# Create Decision Tree model
dt = DecisionTreeClassifier(random_state=42, criterion='entropy', splitter='random', max_depth=100, min_samples_split=10, min_samples_leaf=5, min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None, class_weight=None, ccp_alpha=0.0)

# Train model on resampled data
dt.fit(X_train, y_train)

# Predict on test data
y_pred_dt = dt.predict(X_test)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_dt)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
In [234]:
# # create a random forest classifier
# rf = RandomForestClassifier(random_state=42)

# # define the hyperparameter grid
# param_grid = {
#     'n_estimators': [50, 100, 200, 300],
#     'max_depth': [5, 10, 15, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # create the grid search object
# grid_rf = GridSearchCV(rf, param_grid, cv=5)


# # fit the grid search object to the data
# grid_rf.fit(X_train, y_train)

# # print the best parameters and the corresponding score
# print("Best parameters: ", grid_rf.best_params_)
# print("Best score: ", grid_rf.best_score_)
  1. RANDOM FOREST CLASSIFIER


In [235]:
# create Random Forest model
rf = RandomForestClassifier(random_state=42, criterion='gini', max_depth = 10, min_samples_leaf = 1, min_samples_split = 5, n_estimators = 50)

# train model on train data
rf.fit(X_train, y_train)

# predict on test data
y_pred_rf = rf.predict(X_test)

rf_metrics = classification_report(y_test, y_pred_rf)
# evaluate model performance on test data
print(rf_metrics)
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       247
           1       0.71      0.11      0.19        47

    accuracy                           0.85       294
   macro avg       0.78      0.55      0.55       294
weighted avg       0.83      0.85      0.80       294

In [236]:
# Create Random Forest model
rf = RandomForestClassifier(random_state=42, criterion='gini', max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=50)

# Train model on train data
rf.fit(X_train, y_train)

# Get predicted probabilities for positive class
y_prob_rf = rf.predict_proba(X_test)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_rf)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_rf)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Random Forest (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [237]:
# Create Random Forest model
rf = RandomForestClassifier(random_state=42, criterion='gini', max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=50)

# Train model on train data
rf.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf.predict(X_test)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_rf)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. GRADIENT BOOSTING


In [238]:
# Create Gradient Boosting Classifier
gb = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb.fit(X_train, y_train)

# Predict the target variable for test data
y_pred_gb = gb.predict(X_test)

gb_metrics = classification_report(y_test, y_pred_gb) 
# evaluate model performance on test data
print(gb_metrics)
              precision    recall  f1-score   support

           0       0.86      0.98      0.91       247
           1       0.57      0.17      0.26        47

    accuracy                           0.85       294
   macro avg       0.72      0.57      0.59       294
weighted avg       0.81      0.85      0.81       294

In [239]:
# Create Gradient Boosting Classifier
gb = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb.fit(X_train, y_train)

# Get predicted probabilities for positive class
y_prob_gb = gb.predict_proba(X_test)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_gb)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_gb)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Gradient Boosting (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [240]:
# Create Gradient Boosting Classifier
gb = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb.fit(X_train, y_train)

# Predict the target variable for test data
y_pred_gb = gb.predict(X_test)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_gb)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. XGBoost CLASSIFIER


In [241]:
# Create XGBoost Classifier
xgb = xgb.XGBClassifier(random_state=42)

# Fit the model to the training data
xgb.fit(X_train, y_train)

# Predict the target variable for test data
y_pred_xgb = xgb.predict(X_test)

xgb_metrics = classification_report(y_test, y_pred_xgb) 
# evaluate model performance on test data
print(xgb_metrics)
              precision    recall  f1-score   support

           0       0.87      0.97      0.92       247
           1       0.61      0.23      0.34        47

    accuracy                           0.85       294
   macro avg       0.74      0.60      0.63       294
weighted avg       0.83      0.85      0.83       294

PERFORMING PCA TO NON-AUGMENTED DATA



In [242]:
# Fit PCA to training NON-AUGMENTED data
pca = PCA(n_components=60)
X_train_pca = pca.fit_transform(X_train)

# Transform test data using the same PCA object
X_test_pca = pca.transform(X_test)
  1. LOGISTIC REGRESSION PCA


In [243]:
# create logistic regression model
lr_pca = LogisticRegression(random_state=42, max_iter=1000, class_weight = None )

# train model on resampled data
lr_pca.fit(X_train_pca, y_train)

# predict on test data
y_pred_lr_pca = lr.predict(X_test)

lr_pca_metrics = classification_report(y_test, y_pred_lr_pca)
# evaluate model performance on test data
print(lr_pca_metrics)
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       247
           1       0.71      0.47      0.56        47

    accuracy                           0.88       294
   macro avg       0.81      0.72      0.75       294
weighted avg       0.87      0.88      0.87       294

In [244]:
# Create logistic regression model with PCA
lr_pca = LogisticRegression(random_state=42, max_iter=1000, class_weight=None)

# Train model on resampled data
lr_pca.fit(X_train_pca, y_train)

# Get predicted probabilities for positive class
y_prob_lr_pca = lr_pca.predict_proba(X_test_pca)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_lr_pca)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_lr_pca)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Logistic Regression with PCA (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [245]:
# Create logistic regression model with PCA
lr_pca = LogisticRegression(random_state=42, max_iter=1000, class_weight=None)

# Train model on resampled data
lr_pca.fit(X_train_pca, y_train)

# Predict the target variable for test data
y_pred_lr_pca = lr_pca.predict(X_test_pca)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_lr_pca)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. DECISION TREE PCA


In [246]:
# create Decision Tree model
dt_pca = DecisionTreeClassifier(random_state=42, criterion='gini', splitter='random', max_depth=10, min_samples_split=10, min_samples_leaf=2, min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None, class_weight=None, ccp_alpha=0.0)

# train model on resampled data
dt_pca.fit(X_train_pca, y_train)

# predict on test data
y_pred_dt_pca = dt_pca.predict(X_test_pca)

# evaluate model performance on test data
dt_pca_metrics = classification_report(y_test, y_pred_dt_pca)
print(dt_pca_metrics)
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       247
           1       0.43      0.32      0.37        47

    accuracy                           0.82       294
   macro avg       0.65      0.62      0.63       294
weighted avg       0.80      0.82      0.81       294

In [247]:
# Create Decision Tree model with PCA
dt_pca = DecisionTreeClassifier(random_state=42, criterion='gini', splitter='random', max_depth=10, min_samples_split=10, min_samples_leaf=2, min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None, class_weight=None, ccp_alpha=0.0)

# Train model on resampled data
dt_pca.fit(X_train_pca, y_train)

# Get predicted probabilities for positive class
y_prob_dt_pca = dt_pca.predict_proba(X_test_pca)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_dt_pca)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_dt_pca)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Decision Tree with PCA (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [248]:
# Create Decision Tree model with PCA
dt_pca = DecisionTreeClassifier(random_state=42, criterion='gini', splitter='random', max_depth=10, min_samples_split=10, min_samples_leaf=2, min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None, class_weight=None, ccp_alpha=0.0)

# Train model on resampled data
dt_pca.fit(X_train_pca, y_train)

# Predict the target variable for test data
y_pred_dt_pca = dt_pca.predict(X_test_pca)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_dt_pca)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
In [249]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import GridSearchCV

# # create Decision Tree model
# dt_pca = DecisionTreeClassifier(random_state=42)

# # define range of alpha values to test
# params = {'ccp_alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]}

# # perform grid search to find optimal value of alpha
# grid_search = GridSearchCV(dt_pca, params, cv=5)
# grid_search.fit(X_train_pca, y_train)

# # create pruned decision tree model with optimal alpha value
# dt_pca_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=grid_search.best_params_['ccp_alpha'])
# dt_pca_pruned.fit(X_train_pca, y_train)

# # predict on test data
# y_pred_dt_pca_pruned = dt_pca_pruned.predict(X_test_pca)

# # evaluate model performance on test data
# dt_pca_pruned_metrics = classification_report(y_test, y_pred_dt_pca_pruned)
# print(dt_pca_pruned_metrics)
  1. RANDOM FOREST CLASSIFIER PCA


In [250]:
# create Random Forest model
rf_pca = RandomForestClassifier(random_state=42, criterion='gini', max_depth = 10, min_samples_leaf = 5, min_samples_split = 4, n_estimators = 50)

# train model on train data
rf_pca.fit(X_train_pca, y_train)

# predict on test data
y_pred_rf_pca = rf_pca.predict(X_test_pca)

rf_pca_metrics = classification_report(y_test, y_pred_rf_pca)
# evaluate model performance on test data
print(rf_pca_metrics)
              precision    recall  f1-score   support

           0       0.85      1.00      0.91       247
           1       0.67      0.04      0.08        47

    accuracy                           0.84       294
   macro avg       0.76      0.52      0.50       294
weighted avg       0.82      0.84      0.78       294

In [251]:
# Create Random Forest model with PCA
rf_pca = RandomForestClassifier(random_state=42, criterion='gini', max_depth=10, min_samples_leaf=5, min_samples_split=4, n_estimators=50)

# Train model on train data
rf_pca.fit(X_train_pca, y_train)

# Get predicted probabilities for positive class
y_prob_rf_pca = rf_pca.predict_proba(X_test_pca)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_rf_pca)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_rf_pca)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Random Forest with PCA (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [252]:
# Create Random Forest model with PCA
rf_pca = RandomForestClassifier(random_state=42, criterion='gini', max_depth=10, min_samples_leaf=5, min_samples_split=4, n_estimators=50)

# Train model on train data
rf_pca.fit(X_train_pca, y_train)

# Predict the target variable for test data
y_pred_rf_pca = rf_pca.predict(X_test_pca)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_rf_pca)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. GRADIENT BOOSTING PCA


In [253]:
# Create Gradient Boosting Classifier
gb_pca = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb_pca.fit(X_train_pca, y_train)

# Predict the target variable for test data
y_pred_gb_pca = gb_pca.predict(X_test_pca)

gb_pca_metrics = classification_report(y_test, y_pred_gb_pca) 
# evaluate model performance on test data
print(gb_pca_metrics)
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       247
           1       0.50      0.19      0.28        47

    accuracy                           0.84       294
   macro avg       0.68      0.58      0.59       294
weighted avg       0.80      0.84      0.81       294

In [254]:
# Create Gradient Boosting Classifier with PCA
gb_pca = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb_pca.fit(X_train_pca, y_train)

# Get predicted probabilities for positive class
y_prob_gb_pca = gb_pca.predict_proba(X_test_pca)[:, 1]

# Calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob_gb_pca)

# Calculate AUC score
auc = roc_auc_score(y_test, y_prob_gb_pca)
# set the figure size
plt.figure(figsize=(8, 6))
# Plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Gradient Boosting with PCA (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
In [255]:
# Create Gradient Boosting Classifier with PCA
gb_pca = GradientBoostingClassifier(random_state=42)

# Fit the model to the training data
gb_pca.fit(X_train_pca, y_train)

# Predict the target variable for test data
y_pred_gb_pca = gb_pca.predict(X_test_pca)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_gb_pca)
# set the figure size
plt.figure(figsize=(8, 6))
# Create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
  1. XGBoost CLASSIFIER PCA


In [256]:
# Create XGBoost Classifier
xgb_pca = XGBClassifier(random_state=42)

# Fit the model to the training data
xgb_pca.fit(X_train_pca, y_train)

# Predict the target variable for test data
y_pred_xgb_pca = xgb_pca.predict(X_test_pca)

xgb_pca_metrics = classification_report(y_test, y_pred_xgb_pca)
# evaluate model performance on test data
print(xgb_pca_metrics)
              precision    recall  f1-score   support

           0       0.87      0.98      0.93       247
           1       0.75      0.26      0.38        47

    accuracy                           0.87       294
   macro avg       0.81      0.62      0.65       294
weighted avg       0.85      0.87      0.84       294

In [257]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

# Create AdaBoost Classifier
ada_pca = AdaBoostClassifier(random_state=42, n_estimators = 300)

# Fit the model to the training data
ada_pca.fit(X_train_pca, y_train)

# Predict the target variable for test data
y_pred_ada_pca = ada_pca.predict(X_test_pca)

ada_pca_metrics = classification_report(y_test, y_pred_ada_pca)
# Evaluate model performance on test data
print(ada_pca_metrics)
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       247
           1       0.47      0.38      0.42        47

    accuracy                           0.83       294
   macro avg       0.68      0.65      0.66       294
weighted avg       0.82      0.83      0.83       294

CLUSTERING



1.K-MEANS

In [258]:
# Separating independent and dependent attributes
X1 = new_df.drop(['Attrition'], axis = 1)
In [259]:
X1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   BusinessTravel            1470 non-null   object
 2   DailyRate                 1470 non-null   int64 
 3   Department                1470 non-null   object
 4   DistanceFromHome          1470 non-null   int64 
 5   Education                 1470 non-null   int64 
 6   EducationField            1470 non-null   object
 7   EmployeeCount             1470 non-null   int64 
 8   EmployeeNumber            1470 non-null   int64 
 9   EnvironmentSatisfaction   1470 non-null   int64 
 10  Gender                    1470 non-null   object
 11  HourlyRate                1470 non-null   int64 
 12  JobInvolvement            1470 non-null   int64 
 13  JobLevel                  1470 non-null   int64 
 14  JobRole                   1470 non-null   object
 15  JobSatisfaction           1470 non-null   int64 
 16  MaritalStatus             1470 non-null   object
 17  MonthlyIncome             1470 non-null   int64 
 18  MonthlyRate               1470 non-null   int64 
 19  NumCompaniesWorked        1470 non-null   int64 
 20  Over18                    1470 non-null   object
 21  OverTime                  1470 non-null   object
 22  PercentSalaryHike         1470 non-null   int64 
 23  PerformanceRating         1470 non-null   int64 
 24  RelationshipSatisfaction  1470 non-null   int64 
 25  StandardHours             1470 non-null   int64 
 26  StockOptionLevel          1470 non-null   int64 
 27  TotalWorkingYears         1470 non-null   int64 
 28  TrainingTimesLastYear     1470 non-null   int64 
 29  WorkLifeBalance           1470 non-null   int64 
 30  YearsAtCompany            1470 non-null   int64 
 31  YearsInCurrentRole        1470 non-null   int64 
 32  YearsSinceLastPromotion   1470 non-null   int64 
 33  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(8)
memory usage: 390.6+ KB
In [260]:
# create a StandardScaler object
scaler = StandardScaler()

# fit and transform the numerical attributes
X1[num_at_scale] = scaler.fit_transform(X1[num_at_scale])
In [261]:
X1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1470 non-null   float64
 1   BusinessTravel            1470 non-null   object 
 2   DailyRate                 1470 non-null   float64
 3   Department                1470 non-null   object 
 4   DistanceFromHome          1470 non-null   float64
 5   Education                 1470 non-null   int64  
 6   EducationField            1470 non-null   object 
 7   EmployeeCount             1470 non-null   int64  
 8   EmployeeNumber            1470 non-null   int64  
 9   EnvironmentSatisfaction   1470 non-null   int64  
 10  Gender                    1470 non-null   object 
 11  HourlyRate                1470 non-null   float64
 12  JobInvolvement            1470 non-null   int64  
 13  JobLevel                  1470 non-null   int64  
 14  JobRole                   1470 non-null   object 
 15  JobSatisfaction           1470 non-null   int64  
 16  MaritalStatus             1470 non-null   object 
 17  MonthlyIncome             1470 non-null   float64
 18  MonthlyRate               1470 non-null   float64
 19  NumCompaniesWorked        1470 non-null   float64
 20  Over18                    1470 non-null   object 
 21  OverTime                  1470 non-null   object 
 22  PercentSalaryHike         1470 non-null   float64
 23  PerformanceRating         1470 non-null   int64  
 24  RelationshipSatisfaction  1470 non-null   int64  
 25  StandardHours             1470 non-null   int64  
 26  StockOptionLevel          1470 non-null   int64  
 27  TotalWorkingYears         1470 non-null   float64
 28  TrainingTimesLastYear     1470 non-null   int64  
 29  WorkLifeBalance           1470 non-null   int64  
 30  YearsAtCompany            1470 non-null   float64
 31  YearsInCurrentRole        1470 non-null   float64
 32  YearsSinceLastPromotion   1470 non-null   float64
 33  YearsWithCurrManager      1470 non-null   float64
dtypes: float64(13), int64(13), object(8)
memory usage: 390.6+ KB
In [262]:
# Label encoding the categorical attributes
le_dict = {}

for col in cat_cols:
    le = LabelEncoder()
    X1[col] = le.fit_transform(X1[col])
    le_dict[col] = le
In [263]:
X1 = X1.drop('Over18', axis =1)
In [264]:
X1.head()
Out[264]:
Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 0.446350 2 0.742527 2 -1.010909 1 1 1 1 1 ... 0 80 0 -0.421642 0 0 -0.164613 -0.063296 -0.679146 0.245834
1 1.322365 1 -1.297775 1 -0.147150 0 1 1 2 2 ... 3 80 1 -0.164511 3 2 0.488508 0.764998 -0.368715 0.806541
2 0.008343 2 1.414363 1 -0.887515 1 4 1 4 3 ... 1 80 0 -0.550208 3 2 -1.144294 -1.167687 -0.679146 -1.155935
3 -0.429664 1 1.461466 1 -0.764121 3 1 1 5 3 ... 2 80 0 -0.421642 3 2 0.161947 0.764998 0.252146 -1.155935
4 -1.086676 2 -0.524295 1 -0.887515 0 3 1 7 0 ... 3 80 1 -0.678774 3 2 -0.817734 -0.615492 -0.058285 -0.595227

5 rows × 33 columns

In [265]:
new_df['TotalWorkingYears'].describe()
Out[265]:
count    1470.000000
mean       11.279592
std         7.780782
min         0.000000
25%         6.000000
50%        10.000000
75%        15.000000
max        40.000000
Name: TotalWorkingYears, dtype: float64
In [266]:
# Calculate correlations with the target variable
correlations = df_one_hot.corr()['Attrition'].sort_values(ascending=False)
# Set max_rows to None to display the entire list
pd.set_option('display.max_rows', None)
# Print correlations
print(correlations)
Attrition                            1.000000
OverTime_Yes                         0.246118
JobLevel_1                           0.212551
StockOptionLevel_0                   0.195342
MaritalStatus_Single                 0.175419
JobRole_Sales Representative         0.157234
EnvironmentSatisfaction_1            0.122819
JobInvolvement_1                     0.117161
BusinessTravel_Travel_Frequently     0.115143
WorkLifeBalance_1                    0.098689
JobRole_Laboratory Technician        0.098290
JobSatisfaction_1                    0.090329
Department_Sales                     0.080855
DistanceFromHome                     0.077924
EducationField_Technical Degree      0.069355
TrainingTimesLastYear_0              0.061894
RelationshipSatisfaction_1           0.059222
EducationField_Marketing             0.055781
JobInvolvement_2                     0.044731
NumCompaniesWorked                   0.043494
TrainingTimesLastYear_4              0.041216
TrainingTimesLastYear_2              0.037544
EducationField_Human Resources       0.036466
JobRole_Human Resources              0.036215
Gender_Male                          0.029453
Education_3                          0.025723
Education_1                          0.020777
JobRole_Sales Executive              0.019774
Department_Human Resources           0.016832
MonthlyRate                          0.015170
WorkLifeBalance_4                    0.014131
WorkLifeBalance_2                    0.011093
StockOptionLevel_3                   0.010271
JobSatisfaction_3                    0.007015
JobSatisfaction_2                    0.004038
PerformanceRating_4                  0.002889
JobRole_Research Scientist          -0.000360
PerformanceRating_3                 -0.002889
HourlyRate                          -0.006846
Education_2                         -0.006884
RelationshipSatisfaction_3          -0.011984
PercentSalaryHike                   -0.013478
EnvironmentSatisfaction_2           -0.015267
JobLevel_3                          -0.016380
RelationshipSatisfaction_2          -0.017611
EducationField_Other                -0.017898
TrainingTimesLastYear_1             -0.021113
RelationshipSatisfaction_4          -0.022940
Education_4                         -0.025676
Education_5                         -0.028507
Gender_Female                       -0.029453
EducationField_Life Sciences        -0.032703
YearsSinceLastPromotion             -0.033019
TrainingTimesLastYear_5             -0.035170
TrainingTimesLastYear_3             -0.039854
TrainingTimesLastYear_6             -0.040309
EnvironmentSatisfaction_3           -0.044209
EducationField_Medical              -0.046999
EnvironmentSatisfaction_4           -0.047909
BusinessTravel_Travel_Rarely        -0.049538
JobLevel_5                          -0.053566
JobInvolvement_3                    -0.056213
DailyRate                           -0.056652
JobInvolvement_4                    -0.063577
WorkLifeBalance_3                   -0.064301
BusinessTravel_Non-Travel           -0.074457
JobRole_Healthcare Representative   -0.078696
StockOptionLevel_2                  -0.080472
JobRole_Manufacturing Director      -0.082994
JobRole_Manager                     -0.083316
Department_Research & Development   -0.085293
JobLevel_4                          -0.086461
MaritalStatus_Divorced              -0.087716
JobSatisfaction_4                   -0.087830
JobRole_Research Director           -0.088870
MaritalStatus_Married               -0.090984
JobLevel_2                          -0.131138
YearsAtCompany                      -0.134392
StockOptionLevel_1                  -0.151049
YearsWithCurrManager                -0.156199
Age                                 -0.159205
MonthlyIncome                       -0.159840
YearsInCurrentRole                  -0.160545
TotalWorkingYears                   -0.171063
OverTime_No                         -0.246118
Name: Attrition, dtype: float64
In [267]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering


# Select the top correlated features (e.g., top 10)
top_features = correlations[1:].index.tolist()#correlations[1:11]


# Subset the data using the top correlated features
X = df_one_hot[top_features].values

# Perform hierarchical clustering with the correlation matrix as the distance metric
corr_matrix = np.corrcoef(X, rowvar=False)
clustering = AgglomerativeClustering(n_clusters=2, affinity='precomputed', linkage='complete').fit(corr_matrix)

# Print the cluster labels for the features
print(clustering.labels_)
[0 1 1 0 0 0 0 1 1 0 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1 1 0 1 0 1 1 1 1 1 0 0 0
 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 0
 0 0 1 1 0 1 1 0 0 0]
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_agglomerative.py:983: FutureWarning: Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4. Use `metric` instead
  warnings.warn(
In [268]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering


# Select the top and bottom correlated features (e.g., top 10 and bottom 10)
top_features = correlations[1:11].index.tolist()
# bottom_features = correlations[-10:].index.tolist()
# selected_features = top_features + bottom_features

# Subset the data using the selected features
X = df_one_hot[top_features].values

# Perform hierarchical clustering with the correlation matrix as the distance metric
corr_matrix = np.corrcoef(X, rowvar=False)
clustering = AgglomerativeClustering(n_clusters=2).fit(corr_matrix)

# Print the cluster labels for the features
print(clustering.labels_)
[0 0 1 1 0 0 0 0 0 0]
In [269]:
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

# Compute the linkage matrix
Z = sch.linkage(corr_matrix, method='complete')

print(Z)

print(top_features)
[[ 2.          3.          0.30200354  2.        ]
 [ 1.          9.          0.95978862  2.        ]
 [ 5.          8.          1.36870849  2.        ]
 [ 4.          7.          1.3721126   2.        ]
 [ 0.          6.          1.40527624  2.        ]
 [13.         14.          1.4406976   4.        ]
 [11.         12.          1.48986331  4.        ]
 [15.         16.          1.58494054  8.        ]
 [10.         17.          1.66542273 10.        ]]
['OverTime_Yes', 'JobLevel_1', 'StockOptionLevel_0', 'MaritalStatus_Single', 'JobRole_Sales Representative', 'EnvironmentSatisfaction_1', 'JobInvolvement_1', 'BusinessTravel_Travel_Frequently', 'WorkLifeBalance_1', 'JobRole_Laboratory Technician']
In [270]:
# Create a dendrogram
plt.figure(figsize=(10, 10))
sch.dendrogram(Z, labels=top_features)
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Distance')
plt.title('Dendrogram')
plt.show()
In [271]:
import seaborn as sns

# Calculate correlations with the target variable
correlations = df_one_hot.corr()['Attrition'].sort_values(ascending=False)

# Create a correlation matrix of all features
corr_matrix = df_one_hot.corr()

# Plot a heatmap of the correlations with the target variable
sns.set(rc={"figure.figsize":(20, 15)})
sns.heatmap(corr_matrix.loc[:, ['Attrition']], annot=True, cmap='coolwarm')
Out[271]:
<Axes: >
In [138]:
attribute_list = ['Age', 'DailyRate', 'DistanceFromHome', 'HourlyRate',
                  'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
                  'PercentSalaryHike', 'TotalWorkingYears', 'YearsAtCompany',
                  'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

# Create a list of all pairs of columns to test
pairs = list(itertools.combinations(attribute_list, 2))

# Initialize variables to hold the best scores and attribute pairs
best_silhouette_pairs_std = {'kmeans': None, 'agg': None, 'gmm': None}
best_silhouette_scores_std = {'kmeans': -1, 'agg': -1, 'gmm': -1}
best_calinski_harabasz_pairs_std = {'kmeans': None, 'agg': None, 'gmm': None}
best_calinski_harabasz_scores_std = {'kmeans': -1, 'agg': -1, 'gmm': -1}
best_davies_bouldin_pairs_std = {'kmeans': None, 'agg': None, 'gmm': None}
best_davies_bouldin_scores_std = {'kmeans': float("inf"), 'agg': float("inf"), 'gmm': float("inf")}

# Loop through all pairs and calculate scores
for pair in pairs:
    X = df_one_hot[list(pair)]

    # KMeans Clustering
    kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    kmeans_labels = kmeans.labels_

    # Agglomerative Clustering
    agg = AgglomerativeClustering(n_clusters=2).fit(X)
    agg_labels = agg.labels_

    # GMM Clustering
    gmm = GaussianMixture(n_components=2, random_state=0).fit(X)
    gmm_labels = gmm.predict(X)

    # Calculate scores for each algorithm
    silhouette_kmeans = silhouette_score(X, kmeans_labels)
    silhouette_agg = silhouette_score(X, agg_labels)
    silhouette_gmm = silhouette_score(X, gmm_labels)

    calinski_harabasz_kmeans = calinski_harabasz_score(X, kmeans_labels)
    calinski_harabasz_agg = calinski_harabasz_score(X, agg_labels)
    calinski_harabasz_gmm = calinski_harabasz_score(X, gmm_labels)

    davies_bouldin_kmeans = davies_bouldin_score(X, kmeans_labels)
    davies_bouldin_agg = davies_bouldin_score(X, agg_labels)
    davies_bouldin_gmm = davies_bouldin_score(X, gmm_labels)

    # Update best scores and attribute pairs for each algorithm
    if silhouette_kmeans > best_silhouette_scores_std['kmeans']:
        best_silhouette_scores_std['kmeans'] = silhouette_kmeans
        best_silhouette_pairs_std['kmeans'] = pair

    if calinski_harabasz_kmeans > best_calinski_harabasz_scores_std['kmeans']:
        best_calinski_harabasz_scores_std['kmeans'] = calinski_harabasz_kmeans
        best_calinski_harabasz_pairs_std['kmeans'] = pair

       # Update best scores and attribute pairs for KMeans algorithm
    if davies_bouldin_kmeans < best_davies_bouldin_scores_std['kmeans']:
        best_davies_bouldin_scores_std['kmeans'] = davies_bouldin_kmeans
        best_davies_bouldin_pairs_std['kmeans'] = pair

    # Update best scores and attribute pairs for Agglomerative Clustering algorithm
    if silhouette_agg > best_silhouette_scores_std['agg']:
        best_silhouette_scores_std['agg'] = silhouette_agg
        best_silhouette_pairs_std['agg'] = pair

    if calinski_harabasz_agg > best_calinski_harabasz_scores_std['agg']:
        best_calinski_harabasz_scores_std['agg'] = calinski_harabasz_agg
        best_calinski_harabasz_pairs_std['agg'] = pair

    if davies_bouldin_agg < best_davies_bouldin_scores_std['agg']:
        best_davies_bouldin_scores_std['agg'] = davies_bouldin_agg
        best_davies_bouldin_pairs_std['agg'] = pair

    # Update best scores and attribute pairs for GMM algorithm
    if silhouette_gmm > best_silhouette_scores_std['gmm']:
        best_silhouette_scores_std['gmm'] = silhouette_gmm
        best_silhouette_pairs_std['gmm'] = pair

    if calinski_harabasz_gmm > best_calinski_harabasz_scores_std['gmm']:
        best_calinski_harabasz_scores_std['gmm'] = calinski_harabasz_gmm
        best_calinski_harabasz_pairs_std['gmm'] = pair

    if davies_bouldin_gmm < best_davies_bouldin_scores_std['gmm']:
        best_davies_bouldin_scores_std['gmm'] = davies_bouldin_gmm
        best_davies_bouldin_pairs_std['gmm'] = pair

print("Best Silhouette Scores:")
print("KMeans: ", best_silhouette_scores_std['kmeans'], "for attributes", best_silhouette_pairs_std['kmeans'])
print("Agglomerative Clustering: ", best_silhouette_scores_std['agg'], "for attributes", best_silhouette_pairs_std['agg'])
print("GMM: ", best_silhouette_scores_std['gmm'], "for attributes", best_silhouette_pairs_std['gmm'])

print("Best Calinski-Harabasz Scores:")
print("KMeans: ", best_calinski_harabasz_scores_std['kmeans'], "for attributes", best_calinski_harabasz_pairs_std['kmeans'])
print("Agglomerative Clustering: ", best_calinski_harabasz_scores_std['agg'], "for attributes", best_calinski_harabasz_pairs_std['agg'])
print("GMM: ", best_calinski_harabasz_scores_std['gmm'], "for attributes", best_calinski_harabasz_pairs_std['gmm'])

print("Best Davies-Bouldin Scores:")
print("KMeans: ", best_davies_bouldin_scores_std['kmeans'], "for attributes", best_davies_bouldin_pairs_std['kmeans'])
print("Agglomerative Clustering: ", best_davies_bouldin_scores_std['agg'], "for attributes", best_davies_bouldin_pairs_std['agg'])
print("GMM: ", best_davies_bouldin_scores_std['gmm'], "for attributes", best_davies_bouldin_pairs_std['gmm'])
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
Best Silhouette Scores:
KMeans:  0.6445442360370812 for attributes ('MonthlyIncome', 'TotalWorkingYears')
Agglomerative Clustering:  0.6361364940522174 for attributes ('MonthlyIncome', 'TotalWorkingYears')
GMM:  0.5818319728083505 for attributes ('YearsAtCompany', 'YearsSinceLastPromotion')
Best Calinski-Harabasz Scores:
KMeans:  2819.3834013096302 for attributes ('YearsInCurrentRole', 'YearsWithCurrManager')
Agglomerative Clustering:  2640.545519228518 for attributes ('YearsInCurrentRole', 'YearsWithCurrManager')
GMM:  1729.0998680358625 for attributes ('YearsAtCompany', 'YearsSinceLastPromotion')
Best Davies-Bouldin Scores:
KMeans:  0.5636413477812293 for attributes ('YearsInCurrentRole', 'YearsWithCurrManager')
Agglomerative Clustering:  0.4558193050321143 for attributes ('MonthlyIncome', 'TotalWorkingYears')
GMM:  0.7486272747474666 for attributes ('YearsAtCompany', 'YearsInCurrentRole')

BEST SILHOUETTE SCORES (std num attributes)



K-MEANS

In [272]:
# Ground truth labels
y_true = df_one_hot['Attrition']

# Predicted cluster labels
pair = best_silhouette_pairs_std['kmeans']
X = df_one_hot[list(pair)]
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
labels = kmeans.labels_

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

AGGLOMERATIVE

In [273]:
# Ground truth labels
y_true = df_one_hot['Attrition']

# Predicted cluster labels
pair = best_silhouette_pairs_std['agg']
X = df_one_hot[list(pair)]
agg = AgglomerativeClustering(n_clusters=2)
labels = agg.fit_predict(X)

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()

GMM

In [274]:
# Ground truth labels
y_true = df_one_hot['Attrition']

# Predicted cluster labels
pair=best_silhouette_pairs_std['gmm'] 
X = df_one_hot[list(pair)]
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
gmm.fit(X)
labels = gmm.predict(X)

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()

BEST CALINSKI-HARABASZ SCORES (std num attributes)



K-MEANS

In [275]:
# Ground truth labels
y_true = df_one_hot['Attrition']

# Predicted cluster labels
pair = best_calinski_harabasz_pairs_std['kmeans']
X = df_one_hot[list(pair)]
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
labels = kmeans.labels_

# Calculate accuracy
y_pred = (labels == 1).astype(int)  # Assign the larger cluster to 1 and the smaller to 0
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, y_pred)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

AGGLOMERATIVE

In [276]:
# Ground truth labels
y_true = df_one_hot['Attrition']

# Predicted cluster labels
pair = best_calinski_harabasz_pairs_std['agg']
X = df_one_hot[list(pair)]
agg = AgglomerativeClustering(n_clusters=2, linkage='average')
labels = agg.fit_predict(X)

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()

GMM

In [277]:
# Ground truth labels
y_true = df_one_hot['Attrition']

# Predicted cluster labels
pair = best_calinski_harabasz_pairs_std['gmm']
X = df_one_hot[list(pair)]
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
gmm.fit(X)
labels = gmm.predict(X)

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()

BEST DAVIES-BOULDIN SCORES (std num attributes)



K-MEANS

In [278]:
# Ground truth labels
y_true = df_one_hot['Attrition']

# Predicted cluster labels
pair = best_davies_bouldin_pairs_std['kmeans']
X = df_one_hot[list(pair)]
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
labels = kmeans.labels_

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

AGGLOMERATIVE

In [279]:
# Ground truth labels
y_true = df_one_hot['Attrition']

# Predicted cluster labels
pair = best_davies_bouldin_pairs_std['agg']
X = df_one_hot[list(pair)]
agg = AgglomerativeClustering(n_clusters=2)
labels = agg.fit_predict(X)

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()
In [280]:
attribute_list = ['Age', 'DailyRate', 'DistanceFromHome', 'HourlyRate',
                  'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
                  'PercentSalaryHike', 'TotalWorkingYears', 'YearsAtCompany',
                  'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

# Create a list of all pairs of columns to test
pairs = list(itertools.combinations(attribute_list, 2))

# Initialize variables to hold the best scores and attribute pairs
best_silhouette_pairs = {'kmeans': None, 'agg': None, 'gmm': None}
best_silhouette_scores = {'kmeans': -1, 'agg': -1, 'gmm': -1}
best_calinski_harabasz_pairs = {'kmeans': None, 'agg': None, 'gmm': None}
best_calinski_harabasz_scores = {'kmeans': -1, 'agg': -1, 'gmm': -1}
best_davies_bouldin_pairs = {'kmeans': None, 'agg': None, 'gmm': None}
best_davies_bouldin_scores = {'kmeans': float("inf"), 'agg': float("inf"), 'gmm': float("inf")}

# Loop through all pairs and calculate scores
for pair in pairs:
    X = new_df[list(pair)]#df_one_hot

    # KMeans Clustering
    kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    kmeans_labels = kmeans.labels_

    # Agglomerative Clustering
    agg = AgglomerativeClustering(n_clusters=2).fit(X)
    agg_labels = agg.labels_

    # GMM Clustering
    gmm = GaussianMixture(n_components=2, random_state=0).fit(X)
    gmm_labels = gmm.predict(X)

    # Calculate scores for each algorithm
    silhouette_kmeans = silhouette_score(X, kmeans_labels)
    silhouette_agg = silhouette_score(X, agg_labels)
    silhouette_gmm = silhouette_score(X, gmm_labels)

    calinski_harabasz_kmeans = calinski_harabasz_score(X, kmeans_labels)
    calinski_harabasz_agg = calinski_harabasz_score(X, agg_labels)
    calinski_harabasz_gmm = calinski_harabasz_score(X, gmm_labels)

    davies_bouldin_kmeans = davies_bouldin_score(X, kmeans_labels)
    davies_bouldin_agg = davies_bouldin_score(X, agg_labels)
    davies_bouldin_gmm = davies_bouldin_score(X, gmm_labels)

    # Update best scores and attribute pairs for each algorithm
    if silhouette_kmeans > best_silhouette_scores['kmeans']:
        best_silhouette_scores['kmeans'] = silhouette_kmeans
        best_silhouette_pairs['kmeans'] = pair

    if calinski_harabasz_kmeans > best_calinski_harabasz_scores['kmeans']:
        best_calinski_harabasz_scores['kmeans'] = calinski_harabasz_kmeans
        best_calinski_harabasz_pairs['kmeans'] = pair

       # Update best scores and attribute pairs for KMeans algorithm
    if davies_bouldin_kmeans < best_davies_bouldin_scores['kmeans']:
        best_davies_bouldin_scores['kmeans'] = davies_bouldin_kmeans
        best_davies_bouldin_pairs['kmeans'] = pair

    # Update best scores and attribute pairs for Agglomerative Clustering algorithm
    if silhouette_agg > best_silhouette_scores['agg']:
        best_silhouette_scores['agg'] = silhouette_agg
        best_silhouette_pairs['agg'] = pair

    if calinski_harabasz_agg > best_calinski_harabasz_scores['agg']:
        best_calinski_harabasz_scores['agg'] = calinski_harabasz_agg
        best_calinski_harabasz_pairs['agg'] = pair

    if davies_bouldin_agg < best_davies_bouldin_scores['agg']:
        best_davies_bouldin_scores['agg'] = davies_bouldin_agg
        best_davies_bouldin_pairs['agg'] = pair

    # Update best scores and attribute pairs for GMM algorithm
    if silhouette_gmm > best_silhouette_scores['gmm']:
        best_silhouette_scores['gmm'] = silhouette_gmm
        best_silhouette_pairs['gmm'] = pair

    if calinski_harabasz_gmm > best_calinski_harabasz_scores['gmm']:
        best_calinski_harabasz_scores['gmm'] = calinski_harabasz_gmm
        best_calinski_harabasz_pairs['gmm'] = pair

    if davies_bouldin_gmm < best_davies_bouldin_scores['gmm']:
        best_davies_bouldin_scores['gmm'] = davies_bouldin_gmm
        best_davies_bouldin_pairs['gmm'] = pair

print("Best Silhouette Scores:")
print("KMeans: ", best_silhouette_scores['kmeans'], "for attributes", best_silhouette_pairs['kmeans'])
print("Agglomerative Clustering: ", best_silhouette_scores['agg'], "for attributes", best_silhouette_pairs['agg'])
print("GMM: ", best_silhouette_scores['gmm'], "for attributes", best_silhouette_pairs['gmm'])

print("Best Calinski-Harabasz Scores:")
print("KMeans: ", best_calinski_harabasz_scores['kmeans'], "for attributes", best_calinski_harabasz_pairs['kmeans'])
print("Agglomerative Clustering: ", best_calinski_harabasz_scores['agg'], "for attributes", best_calinski_harabasz_pairs['agg'])
print("GMM: ", best_calinski_harabasz_scores['gmm'], "for attributes", best_calinski_harabasz_pairs['gmm'])

print("Best Davies-Bouldin Scores:")
print("KMeans: ", best_davies_bouldin_scores['kmeans'], "for attributes", best_davies_bouldin_pairs['kmeans'])
print("Agglomerative Clustering: ", best_davies_bouldin_scores['agg'], "for attributes", best_davies_bouldin_pairs['agg'])
print("GMM: ", best_davies_bouldin_scores['gmm'], "for attributes", best_davies_bouldin_pairs['gmm'])
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
Best Silhouette Scores:
KMeans:  0.7079814363371862 for attributes ('MonthlyIncome', 'NumCompaniesWorked')
Agglomerative Clustering:  0.7079810621919209 for attributes ('MonthlyIncome', 'YearsSinceLastPromotion')
GMM:  0.6816849210008012 for attributes ('MonthlyIncome', 'PercentSalaryHike')
Best Calinski-Harabasz Scores:
KMeans:  4557.699584814114 for attributes ('MonthlyRate', 'NumCompaniesWorked')
Agglomerative Clustering:  4459.609326670578 for attributes ('DailyRate', 'YearsWithCurrManager')
GMM:  4556.823061032582 for attributes ('MonthlyRate', 'YearsWithCurrManager')
Best Davies-Bouldin Scores:
KMeans:  0.4894599369026216 for attributes ('MonthlyRate', 'NumCompaniesWorked')
Agglomerative Clustering:  0.48415081584694925 for attributes ('DailyRate', 'YearsAtCompany')
GMM:  0.48958663466549857 for attributes ('MonthlyRate', 'YearsWithCurrManager')

BEST SILHOUETTE SCORES



K-MEANS

In [281]:
# Ground truth labels
y_true = (new_df['Attrition'] == 'Yes').astype(int)  # Convert 'Yes' to 1, 'No' to 0

# Predicted cluster labels
pair = best_silhouette_pairs['kmeans']
X = new_df[list(pair)]
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
labels = kmeans.labels_

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

AGGLOMERATIVE

In [282]:
# Ground truth labels
y_true = (new_df['Attrition'] == 'Yes').astype(int)  # Convert 'Yes' to 1, 'No' to 0

# Predicted cluster labels
pair=best_silhouette_pairs['agg']
X = new_df[list(pair)]
agg = AgglomerativeClustering(n_clusters=2, linkage = 'single')
labels = agg.fit_predict(X)

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()

GMM

In [283]:
# Ground truth labels
y_true = (new_df['Attrition'] == 'Yes').astype(int)  # Convert 'Yes' to 1, 'No' to 0

# Predicted cluster labels
pair=best_silhouette_pairs['gmm'] 
X = new_df[list(pair)]
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
gmm.fit(X)
labels = gmm.predict(X)

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()

BEST CALINSKI-HARABASZ SCORES



K-MEANS

In [284]:
# Ground truth labels
y_true = (new_df['Attrition'] == 'Yes').astype(int)  # Convert 'Yes' to 1, 'No' to 0

# Predicted cluster labels
pair=best_calinski_harabasz_pairs['kmeans']
X = new_df[list(pair)]
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
labels = kmeans.labels_

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(

AGGLOMERATIVE

In [285]:
# Ground truth labels
y_true = (new_df['Attrition'] == 'Yes').astype(int)  # Convert 'Yes' to 1, 'No' to 0

# Predicted cluster labels
pair=best_calinski_harabasz_pairs['agg']
X = new_df[list(pair)]
agg = AgglomerativeClustering(n_clusters=2, linkage = 'average')
labels = agg.fit_predict(X)

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()

GMM

In [286]:
# Ground truth labels
y_true = (new_df['Attrition'] == 'Yes').astype(int)  # Convert 'Yes' to 1, 'No' to 0

# Predicted cluster labels
pair=best_calinski_harabasz_pairs['gmm']
X = new_df[list(pair)]
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
gmm.fit(X)
labels = gmm.predict(X)

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()

BEST DAVIES-BOULDIN SCORES



K-MEANS: SAME AS PREVIOUS

AGGLOMERATIVE

In [287]:
# Ground truth labels
y_true = (new_df['Attrition'] == 'Yes').astype(int)  # Convert 'Yes' to 1, 'No' to 0

# Predicted cluster labels
pair=best_davies_bouldin_pairs['agg']
X = new_df[list(pair)]
agg = AgglomerativeClustering(n_clusters=2 ) #linkage = 'ward'
labels = agg.fit_predict(X)

# Calculate accuracy
y_pred = labels  # Use predicted cluster labels as predicted classes
accuracy = accuracy_score(y_true, y_pred)

# Calculate F1 score
f1 = f1_score(y_true, labels)

# Plot the clusters
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))

# Plot predicted clusters
ax1.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
ax1.set_xlabel(pair[0])
ax1.set_ylabel(pair[1])
ax1.set_title(f'Predicted Clusters (Accuracy: {accuracy:.2f}, F1 Score: {f1:.2f})')

# Plot true clusters
scatter = ax2.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y_true, cmap='viridis')
legend = ax2.legend(*scatter.legend_elements(),
                    loc="lower left", title="Attrition")
ax2.add_artist(legend)
ax2.set_xlabel(pair[0])
ax2.set_ylabel(pair[1])
ax2.set_title('Ground Truth')

plt.show()

GMM: SAME AS ABOVE

NEURAL NETWORK



In [288]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score, recall_score

# Define the neural network architecture
model = Sequential()
model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define the EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)

# Train the model with the EarlyStopping callback
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

# Compute the F1 score and recall
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f'Test F1 score: {f1:.4f}')
print(f'Test recall: {recall:.4f}')
Epoch 1/10
37/37 [==============================] - 1s 11ms/step - loss: 0.6880 - accuracy: 0.5799 - val_loss: 0.4771 - val_accuracy: 0.8401
Epoch 2/10
37/37 [==============================] - 0s 4ms/step - loss: 0.4501 - accuracy: 0.8367 - val_loss: 0.3909 - val_accuracy: 0.8401
Epoch 3/10
37/37 [==============================] - 0s 3ms/step - loss: 0.3870 - accuracy: 0.8384 - val_loss: 0.3675 - val_accuracy: 0.8401
Epoch 4/10
37/37 [==============================] - 0s 5ms/step - loss: 0.3528 - accuracy: 0.8478 - val_loss: 0.3482 - val_accuracy: 0.8605
Epoch 5/10
37/37 [==============================] - 0s 5ms/step - loss: 0.3226 - accuracy: 0.8656 - val_loss: 0.3340 - val_accuracy: 0.8605
Epoch 6/10
37/37 [==============================] - 0s 5ms/step - loss: 0.2980 - accuracy: 0.8869 - val_loss: 0.3183 - val_accuracy: 0.8571
Epoch 7/10
37/37 [==============================] - 0s 6ms/step - loss: 0.2782 - accuracy: 0.8988 - val_loss: 0.3084 - val_accuracy: 0.8639
Epoch 8/10
37/37 [==============================] - 0s 4ms/step - loss: 0.2643 - accuracy: 0.9048 - val_loss: 0.3013 - val_accuracy: 0.8707
Epoch 9/10
37/37 [==============================] - 0s 6ms/step - loss: 0.2523 - accuracy: 0.9150 - val_loss: 0.2976 - val_accuracy: 0.8776
Epoch 10/10
37/37 [==============================] - 0s 4ms/step - loss: 0.2429 - accuracy: 0.9133 - val_loss: 0.3003 - val_accuracy: 0.8776
10/10 [==============================] - 0s 3ms/step - loss: 0.3003 - accuracy: 0.8776
Test loss: 0.3003
Test accuracy: 0.8776
10/10 [==============================] - 0s 2ms/step
Test F1 score: 0.5263
Test recall: 0.4255
In [289]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Get the predicted probabilities for the positive class
y_pred_prob = model.predict(X_test)

# Compute the false positive rate (fpr), true positive rate (tpr), and threshold
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Compute the area under the ROC curve (AUC)
roc_auc = auc(fpr, tpr)

# Set the plot size as 10 by 10
plt.figure(figsize=(10, 10))
# set the figure size
plt.figure(figsize=(8, 6))
# Plot the ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
10/10 [==============================] - 0s 2ms/step
<Figure size 1000x1000 with 0 Axes>
In [290]:
import pandas as pd
from sklearn.metrics import classification_report

# Store the classification reports in a dictionary
class_reports = {
    'lr_smote': lr_smote_metrics,
    'dt_smote': dt_smote_metrics,
    'rf_smote': rf_smote_metrics,
    'gb_smote': gb_smote_metrics,
    'xgb_smote': xgb_smote_metrics,
    'lr_pca_smote': lr_pca_smote_metrics,
    'dt_pca_smote': dt_pca_smote_metrics,
    'rf_pca_smote': rf_pca_smote_metrics,
    'gb_pca_smote': gb_pca_smote_metrics,
    'xgb_pca_smote': xgb_pca_smote_metrics,
    'lr': lr_metrics,
    'dt': dt_metrics,
    'rf': rf_metrics,
    'gb': gb_metrics,
    'xgb': xgb_metrics,
    'lr_pca': lr_pca_metrics,
    'dt_pca': dt_pca_metrics,
    'rf_pca': rf_pca_metrics,
    'gb_pca': gb_pca_metrics,
    'xgb_pca': xgb_pca_metrics,
    'neural_net': classification_report(y_test, y_pred)
}

# Create an empty dataframe
comp_df = pd.DataFrame(columns=['model', 'class_label', 'precision', 'recall', 'f1-score', 'support'])

# Iterate through the dictionary and append the metrics to the dataframe
for model, report in class_reports.items():
    # Split the report by line breaks to get individual lines
    lines = report.split('\n')

    # Extract the metrics for each class
    class_metrics = lines[2:-5]  # Exclude the last two lines that contain averages

    for metric_line in class_metrics:
        # Split each line by whitespace and extract the desired metrics
        metric_values = metric_line.split()
        class_label = int(metric_values[0])
        precision = float(metric_values[1])
        recall = float(metric_values[2])
        f1_score = float(metric_values[3])
        support = int(metric_values[4])

        # Append the metrics to the dataframe
        row = pd.DataFrame({
            'model': [model],
            'class_label': [class_label],
            'precision': [precision],
            'recall': [recall],
            'f1-score': [f1_score],
            'support': [support]
        })
        comp_df = pd.concat([comp_df, row], ignore_index=True)

# Pivot the dataframe to have separate columns for class_label = 0 and class_label = 1
comp_df_pivot = comp_df.pivot(index='model', columns='class_label')

# Rename the columns to include the metric names
comp_df_pivot.columns = [f"{col[0]}_{col[1]}" for col in comp_df_pivot.columns]

# Reset the index
comp_df_pivot.reset_index(inplace=True)
In [295]:
sorted_df = comp_df_pivot.sort_values(by='precision_1', ascending=False)
sorted_df.head(20)
Out[295]:
model precision_0 precision_1 recall_0 recall_1 f1-score_0 f1-score_1 support_0 support_1
18 xgb_pca 0.87 0.75 0.98 0.26 0.93 0.38 247 47
8 lr 0.90 0.71 0.96 0.47 0.93 0.56 247 47
9 lr_pca 0.90 0.71 0.96 0.47 0.93 0.56 247 47
13 rf 0.85 0.71 0.99 0.11 0.92 0.19 247 47
11 lr_smote 0.91 0.70 0.96 0.49 0.93 0.57 247 47
12 neural_net 0.90 0.69 0.96 0.43 0.93 0.53 247 47
14 rf_pca 0.85 0.67 1.00 0.04 0.91 0.08 247 47
17 xgb 0.87 0.61 0.97 0.23 0.92 0.34 247 47
7 gb_smote 0.88 0.58 0.96 0.32 0.92 0.41 247 47
4 gb 0.86 0.57 0.98 0.17 0.91 0.26 247 47
10 lr_pca_smote 0.90 0.55 0.93 0.47 0.91 0.51 247 47
15 rf_pca_smote 0.88 0.54 0.95 0.30 0.91 0.38 247 47
20 xgb_smote 0.88 0.54 0.95 0.30 0.91 0.38 247 47
5 gb_pca 0.86 0.50 0.96 0.19 0.91 0.28 247 47
19 xgb_pca_smote 0.89 0.49 0.92 0.38 0.90 0.43 247 47
1 dt_pca 0.88 0.43 0.92 0.32 0.90 0.37 247 47
6 gb_pca_smote 0.89 0.43 0.90 0.40 0.89 0.42 247 47
16 rf_smote 0.90 0.43 0.87 0.49 0.89 0.46 247 47
3 dt_smote 0.90 0.42 0.87 0.49 0.88 0.45 247 47
0 dt 0.88 0.40 0.90 0.36 0.89 0.38 247 47

REPORT



INTRODUCTION The choice of an employee attrition dataset for data mining is valuable for organizations as it helps identify key factors contributing to turnover. Analyzing attributes like job satisfaction, compensation, work-life balance, and demographics enables pinpointing influential factors. Predictive modeling using historical data allows organizations to forecast attrition and take proactive measures. Additionally, analyzing employee attrition data aids in developing effective retention strategies, tailoring HR policies, programs, and initiatives to address specific areas like job satisfaction. This can improve workplace culture, provide career development opportunities, and revise compensation plans, ultimately enhancing employee retention.

CHALLENGE & CHOICE TO OVERCOME IT Class imbalance is a challenge that can occur in an employee attrition dataset, where one class (attrition) dominates the other, making it difficult to build accurate predictive models. To address this challenge, the Synthetic Minority Over-sampling Technique (SMOTE) is commonly used. SMOTE generates synthetic samples of the minority class by interpolating between neighboring instances. This technique helps balance the class distribution and provides a more representative dataset for model training.

While SMOTE is a widely used technique for addressing class imbalance, it is important to note that it may not always be the optimal choice for every situation. Because of limited time we couldn't proceed to trying all the potentially better methods.

METHODOLOGY



Data quality check: We performed check for missing values in the dataset.

Exploratory Data Analysis (EDA): We examined the shape of the dataset. Also, inspected the data types of each column. Moreover, calculated basic descriptive statistics. Lastly we plotted histograms for numerical features, Creating separate histograms for the 'Attrition' attribute.

Data preprocessing: We identified and dropped irrelevant columns, scaled numerical attributes. Correlation analysis: Then. created a correlation matrix. Plot a heatmap visualization. Identify the highest correlation value and corresponding attribute names.

One-hot encoding: Encoded categorical attributes. Converted the 'Attrition' column to binary values.

Separate features and target variable: Divided the dataset into independent attributes (X) and the target variable (y).

MODEL TRAINING AND EVALUATION



For the sake of completeness we provide a short description of models we trained using the Augmented Data (SMOTE). Iteratevely we trained the same models using 1. SMOTE with PCA,Non-Augmented (Original data), Non-Augmented with PCA.

1.Logistic Regression 2.Decision Tree 3.Random Forest 4.Gradient Boosting 5.XGBoost Classifier 6.AdaBoost

CLUSTERING



Additionally, we performed Correlation analysis, where we calculated the correlation between each attribute and the target variable 'Attrition'. From the above, we selected the 10 most correlated features and draw a hierarchical clustering using complete linkage. The knowledge derived from it can be utilized in feature selection for further experimentation.

Moreover, in an exploratory spirit we have conducted extensive Clustering Analysis for the best attribute pair (searching through numerical attributes only) and clustering algorithm (K-means, AgglomerativeClustering, and GMM) and for each pair of attribut and clustering algorithm we check the following scores: Silhouette, Calinski-Harabasz and Davies-Bouldin. The attribute pairs with the best scores are stored.

For the best attribute pair and clustering algorithm the predicted cluster labels are obtained. Based on the provided in the dataset ground truth labels we calculated the Accuracy and F1 scores.

As you can observe, we obtained very poor results. Something we should be aware of since the dataset is highly imbalanced.

CONCLUSION



From the above table we evaluate the performance of models (for an imbalanced dataset) based on precision_1 = 0.7, recall_1 = 0.49 and f-1score_1 = 0.57. The best performing model is Logistic Regression on Augmented Data (SMOTE). ROCcurve and Confusion matrix is below

In [293]:
# get predicted probabilities for positive class
y_score_lr_smote = lr_smote.predict_proba(X_test)[:, 1]

# calculate false positive rate, true positive rate, and thresholds for ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score_lr_smote)

# calculate AUC score
roc_auc = auc(fpr, tpr)
# set the figure size
plt.figure(figsize=(8, 6))
# plot ROC curve
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Logistic Regression (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
In [294]:
# get predicted labels
y_pred_lr_smote = lr_smote.predict(X_test)

# calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_lr_smote)
# set the figure size
plt.figure(figsize=(8, 6))
# create heatmap of confusion matrix
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()